Commit 979763a
Parent(s): ee8f3fd
combine src of advanced RAG
Changed files:
- deep_agent_rag/rag/private_file_rag.py +26 -161
- src/__init__.py +37 -0
- src/document_processor.py +590 -0
- src/hybrid_subquery_hyde_rag.py +399 -0
- src/hyde_rag.py +235 -0
- src/llm_integration.py +246 -0
- src/prompt_formatter.py +395 -0
- src/retrievers/__init__.py +17 -0
- src/retrievers/base.py +32 -0
- src/retrievers/bm25_retriever.py +127 -0
- src/retrievers/hybrid_search.py +298 -0
- src/retrievers/reranker.py +448 -0
- src/retrievers/vector_retriever.py +254 -0
- src/step_back_rag.py +305 -0
- src/subquery_rag.py +361 -0
- src/triple_hybrid_rag.py +467 -0
deep_agent_rag/rag/private_file_rag.py  CHANGED

@@ -24,34 +24,26 @@ from langchain_core.messages import HumanMessage
 from .llm_adapter import LangChainLLMAdapter
 from .adaptive_rag_selector import AdaptiveRAGSelector, RAGMethod
 
-# Add Learn_RAG to the Python path
-# Compute the Learn_RAG path (it sits in the same directory as Deep_Agentic_AI_Tool)
+# Add the project root to the Python path (so the src modules can be imported)
+# Walk up from deep_agent_rag/rag/private_file_rag.py to the Deep_Agentic_AI_Tool root
 current_file = Path(__file__).resolve()
 # Walk up from deep_agent_rag/rag/private_file_rag.py to the Deep_Agentic_AI_Tool root
+# private_file_rag.py -> rag/ -> deep_agent_rag/ -> Deep_Agentic_AI_Tool/
 deep_agent_root = current_file.parent.parent.parent.parent
-learn_rag_path = deep_agent_root.parent / "Learn_RAG"
-
-# If Learn_RAG is not in the expected location, try other likely locations
-if not learn_rag_path.exists():
-    # Try the parent of the current working directory
-    cwd = Path.cwd()
-    learn_rag_path = cwd.parent / "Learn_RAG"
-
-    if not learn_rag_path.exists():
-        # Fall back to a hard-coded absolute path
-        learn_rag_path = Path("/Users/matthuang/Desktop/Learn_RAG")
 
-# Add the Learn_RAG directory itself to the Python path (the src package lives in Learn_RAG/src/)
-if learn_rag_path.exists() and learn_rag_path.is_dir():
-    if str(learn_rag_path) not in sys.path:
-        sys.path.insert(0, str(learn_rag_path))
-    print(f"✓ Found Learn_RAG project: {learn_rag_path}")
-    print(f"  Python path added: {learn_rag_path}")
+# Check that the src directory exists (it should live under the project root)
+src_path = deep_agent_root / "src"
+if src_path.exists() and src_path.is_dir():
+    # Add the project root (not the src directory itself) to the Python path,
+    # so that `from src.xxx import xxx` imports resolve
+    if str(deep_agent_root) not in sys.path:
+        sys.path.insert(0, str(deep_agent_root))
+    print(f"✓ Found local src modules: {src_path}")
+    print(f"  Project root added to Python path: {deep_agent_root}")
 else:
-    print(f"⚠️ Could not find the Learn_RAG project")
-    print(f"  Path tried: {learn_rag_path}")
-    print(f"  Please make sure the Learn_RAG project is at: {deep_agent_root.parent / 'Learn_RAG'}")
+    print(f"⚠️ Could not find the src directory")
+    print(f"  Expected path: {src_path}")
+    print(f"  Project root: {deep_agent_root}")
 
 # Try to import the Learn_RAG modules
 # Note: document_processor.py imports arxiv at the top level, so the dependencies must be installed first

@@ -78,13 +70,13 @@ try:
 
     if missing_deps:
         print(f"⚠️ Missing the following dependency packages: {', '.join(missing_deps)}")
-        print(f"\n💡 Please install the Learn_RAG project dependencies:")
+        print(f"\n💡 Please install the dependencies required by the RAG system:")
         print(f"  Option 1: use pip")
         print(f"  pip install {' '.join(missing_deps)}")
-        print(f"\n  Option 2: use uv (recommended if Learn_RAG uses uv)")
-        print(f"  cd {learn_rag_path}")
+        print(f"\n  Option 2: use uv (recommended)")
+        print(f"  cd {deep_agent_root}")
         print(f"  uv sync")
-        print(f"\n  Option 3: install all Learn_RAG dependencies")
+        print(f"\n  Option 3: install all dependencies")
         print(f"  pip install arxiv langchain-community langchain-text-splitters chromadb sentence-transformers rank-bm25 pypdf docx2txt langchain-experimental")
         LEARN_RAG_AVAILABLE = False
     else:

@@ -104,22 +96,23 @@ try:
         # OllamaLLM is no longer imported, because we use the unified Deep_Agentic_AI_Tool LLM system (get_llm())
         # from src.llm_integration import OllamaLLM
         LEARN_RAG_AVAILABLE = True
-        print("✓ Successfully imported the Learn_RAG modules (including the advanced RAG methods)")
+        print("✓ Successfully imported the RAG modules (locally integrated version, including the advanced RAG methods)")
 
 except ImportError as e:
     error_msg = str(e)
-    print(f"⚠️ Could not import the Learn_RAG modules: {error_msg}")
-    print(f"\n💡 Please install the Learn_RAG project dependencies:")
+    print(f"⚠️ Could not import the RAG modules: {error_msg}")
+    print(f"\n💡 Please install the dependencies required by the RAG system:")
     print(f"  pip install arxiv langchain-community langchain-text-splitters chromadb sentence-transformers rank-bm25 pypdf docx2txt langchain-experimental")
     print(f"\n  Or:")
-    print(f"  cd {learn_rag_path}")
+    print(f"  cd {deep_agent_root}")
     print(f"  uv sync")
     LEARN_RAG_AVAILABLE = False
 except Exception as e:
     error_msg = str(e)
-    print(f"⚠️ Error while importing the Learn_RAG modules: {error_msg}")
+    print(f"⚠️ Error while importing the RAG modules: {error_msg}")
     print(f"  Current Python path: {sys.path[:3]}")
-    print(f"  Learn_RAG path: {learn_rag_path}")
+    print(f"  Project root: {deep_agent_root}")
+    print(f"  src directory: {src_path}")
     LEARN_RAG_AVAILABLE = False

@@ -1236,134 +1229,6 @@ def reset_private_rag_instance():
     """Reset the global instance"""
     global _private_rag_instance
     _private_rag_instance = None
-
-
-"""
-Private-file RAG system
-Integrates the Learn_RAG features: upload private files (PDF, DOCX, TXT) and answer questions with RAG
-
-LLM selection strategy:
-- Prefer the Groq API (if an API key is configured)
-- Then Ollama (if the service is running)
-- Finally the local MLX model (as a fallback)
-"""
-import os
-import sys
-import time
-from pathlib import Path
-from typing import Optional, Dict, List, Tuple
-import tempfile
-import shutil
-
-# Import the Deep_Agentic_AI_Tool LLM utilities
-# This reuses the unified LLM priority strategy (Groq -> Ollama -> MLX)
-from ..utils.llm_utils import get_llm
-from langchain_core.messages import HumanMessage
-
-# Import the LLM adapter and the adaptive selector
-from .llm_adapter import LangChainLLMAdapter
-from .adaptive_rag_selector import AdaptiveRAGSelector, RAGMethod
-
-# Add Learn_RAG to the Python path
-# Compute the Learn_RAG path (it sits in the same directory as Deep_Agentic_AI_Tool)
-current_file = Path(__file__).resolve()
-# Walk up from deep_agent_rag/rag/private_file_rag.py to the Deep_Agentic_AI_Tool root
-deep_agent_root = current_file.parent.parent.parent.parent
-learn_rag_path = deep_agent_root.parent / "Learn_RAG"
-
-# If Learn_RAG is not in the expected location, try other likely locations
-if not learn_rag_path.exists():
-    # Try the parent of the current working directory
-    cwd = Path.cwd()
-    learn_rag_path = cwd.parent / "Learn_RAG"
-
-    if not learn_rag_path.exists():
-        # Fall back to a hard-coded absolute path
-        learn_rag_path = Path("/Users/matthuang/Desktop/Learn_RAG")
-
-# Add the Learn_RAG directory to the Python path (so the src package can be imported)
-# Note: the Learn_RAG directory itself must be on the path, because the src package lives in Learn_RAG/src/
-if learn_rag_path.exists() and learn_rag_path.is_dir():
-    if str(learn_rag_path) not in sys.path:
-        sys.path.insert(0, str(learn_rag_path))
-    print(f"✓ Found Learn_RAG project: {learn_rag_path}")
-    print(f"  Python path added: {learn_rag_path}")
-else:
-    print(f"⚠️ Could not find the Learn_RAG project")
-    print(f"  Path tried: {learn_rag_path}")
-    print(f"  Please make sure the Learn_RAG project is at: {deep_agent_root.parent / 'Learn_RAG'}")
-
-# Try to import the Learn_RAG modules
-# Note: document_processor.py imports arxiv at the top level, so the dependencies must be installed first
-try:
-    # First check whether the required dependencies are installed
-    import importlib
-
-    required_deps = {
-        "arxiv": "arxiv",
-        "langchain_community": "langchain-community",
-        "langchain_text_splitters": "langchain-text-splitters",
-        "chromadb": "chromadb",
-        "sentence_transformers": "sentence-transformers",
-        "rank_bm25": "rank-bm25",
-        "pypdf": "pypdf",
-    }
-
-    missing_deps = []
-    for module_name, package_name in required_deps.items():
-        try:
-            importlib.import_module(module_name)
-        except ImportError:
-            missing_deps.append(package_name)
-
-    if missing_deps:
-        print(f"⚠️ Missing the following dependency packages: {', '.join(missing_deps)}")
-        print(f"\n💡 Please install the Learn_RAG project dependencies:")
-        print(f"  Option 1: use pip")
-        print(f"  pip install {' '.join(missing_deps)}")
-        print(f"\n  Option 2: use uv (recommended if Learn_RAG uses uv)")
-        print(f"  cd {learn_rag_path}")
-        print(f"  uv sync")
-        print(f"\n  Option 3: install all Learn_RAG dependencies")
-        print(f"  pip install arxiv langchain-community langchain-text-splitters chromadb sentence-transformers rank-bm25 pypdf docx2txt langchain-experimental")
-        LEARN_RAG_AVAILABLE = False
-    else:
-        # All dependencies are installed; try to import the modules
-        from src.document_processor import DocumentProcessor
-        from src.retrievers.bm25_retriever import BM25Retriever
-        from src.retrievers.vector_retriever import VectorRetriever
-        from src.retrievers.hybrid_search import HybridSearch
-        from src.retrievers.reranker import Reranker, RAGPipeline
-        from src.prompt_formatter import PromptFormatter
-        # Import the advanced RAG methods
-        from src.subquery_rag import SubQueryDecompositionRAG
-        from src.hyde_rag import HyDERAG
-        from src.step_back_rag import StepBackRAG
-        from src.hybrid_subquery_hyde_rag import HybridSubqueryHyDERAG
-        from src.triple_hybrid_rag import TripleHybridRAG
-        # OllamaLLM is no longer imported, because we use the unified Deep_Agentic_AI_Tool LLM system (get_llm())
-        # from src.llm_integration import OllamaLLM
-        LEARN_RAG_AVAILABLE = True
-        print("✓ Successfully imported the Learn_RAG modules (including the advanced RAG methods)")
-
-except ImportError as e:
-    error_msg = str(e)
-    print(f"⚠️ Could not import the Learn_RAG modules: {error_msg}")
-    print(f"\n💡 Please install the Learn_RAG project dependencies:")
-    print(f"  pip install arxiv langchain-community langchain-text-splitters chromadb sentence-transformers rank-bm25 pypdf docx2txt langchain-experimental")
-    print(f"\n  Or:")
-    print(f"  cd {learn_rag_path}")
-    print(f"  uv sync")
-    LEARN_RAG_AVAILABLE = False
-except Exception as e:
-    error_msg = str(e)
-    print(f"⚠️ Error while importing the Learn_RAG modules: {error_msg}")
-    print(f"  Current Python path: {sys.path[:3]}")
-    print(f"  Learn_RAG path: {learn_rag_path}")
-    LEARN_RAG_AVAILABLE = False
-
-
 class PrivateFileRAG:
     """
     Private-file RAG system manager
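For reference, a minimal sketch of what the new bootstrap achieves (the layout and file placement below are assumed for illustration, not taken from the commit): once the project root is on sys.path, `from src....` imports resolve against the vendored copy instead of an external Learn_RAG checkout.

# Sketch, assuming this script sits at the project root next to the vendored src/ package.
import sys
from pathlib import Path

project_root = Path(__file__).resolve().parent  # hypothetical placement
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.document_processor import DocumentProcessor  # now imports the local copy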
src/__init__.py  ADDED

@@ -0,0 +1,37 @@
"""
RAG system module package
"""
from .document_processor import DocumentProcessor
from .retrievers import (
    BaseRetriever,
    BM25Retriever,
    VectorRetriever,
    HybridSearch,
    Reranker,
    RAGPipeline,
)
from .prompt_formatter import PromptFormatter
from .llm_integration import OllamaLLM
from .subquery_rag import SubQueryDecompositionRAG
from .hyde_rag import HyDERAG
from .hybrid_subquery_hyde_rag import HybridSubqueryHyDERAG
from .step_back_rag import StepBackRAG
from .triple_hybrid_rag import TripleHybridRAG

__all__ = [
    "DocumentProcessor",
    "BaseRetriever",
    "BM25Retriever",
    "VectorRetriever",
    "HybridSearch",
    "Reranker",
    "RAGPipeline",
    "PromptFormatter",
    "OllamaLLM",
    "SubQueryDecompositionRAG",
    "HyDERAG",
    "HybridSubqueryHyDERAG",
    "StepBackRAG",
    "TripleHybridRAG",
]
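Since __init__.py re-exports everything in __all__, downstream code can import from the package root; a sketch (assumes the project root is on sys.path, as the private_file_rag.py bootstrap arranges):

# Sketch: one-stop imports via the package root.
from src import DocumentProcessor, HyDERAG, RAGPipeline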
src/document_processor.py  ADDED

@@ -0,0 +1,590 @@
"""
Document processing module: load arXiv papers and split their text
Supports local files: PDF, DOCX, TXT
Supports two chunking strategies:
1. Character chunking (default): fixed-size chunks by character count, fast
2. Semantic chunking (optional): chunks by semantic similarity, preserves semantic coherence
"""
from typing import List, Dict, Optional, Any
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
import os
import arxiv
import re

# Try to import the semantic chunker (requires langchain-experimental)
try:
    from langchain_experimental.text_splitter import SemanticChunker
    SEMANTIC_CHUNKER_AVAILABLE = True
except ImportError:
    SEMANTIC_CHUNKER_AVAILABLE = False


class DocumentProcessor:
    """
    Processes arXiv paper documents: splitting and preparation

    Supports two chunking modes:
    - Character chunking (default): fast and stable, suitable for most scenarios
    - Semantic chunking (optional): smarter and preserves semantic coherence, but needs extra dependencies and compute time
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        embeddings: Optional[Any] = None,  # optional: embedding model for semantic chunking
        use_semantic_chunking: bool = False,  # whether to use semantic chunking
        breakpoint_threshold_amount: float = 1.5,  # semantic chunking sensitivity (standard-deviation multiple)
        min_chunk_size: int = 100  # minimum chunk size for semantic chunking (characters)
    ):
        """
        Initialize the document processor

        Args:
            chunk_size: size of each chunk (characters), character-chunking mode only
            chunk_overlap: overlap between chunks (characters), character-chunking mode only
            embeddings: embedding model object used to compute semantic distance (optional)
                Required when use_semantic_chunking=True
            use_semantic_chunking: whether to use semantic chunking
                True: semantic chunking (embeddings required)
                False: character chunking (default)
            breakpoint_threshold_amount: sensitivity of semantic chunking
                Larger values produce fewer (larger) chunks
                Smaller values produce more (smaller) chunks
                Suggested range: 1.0 - 2.0, default 1.5
            min_chunk_size: minimum chunk size for semantic chunking (characters)
                Chunks smaller than this are merged into neighboring chunks
                Default 100 characters
        """
        self.embeddings = embeddings
        self.use_semantic_chunking = use_semantic_chunking
        self.min_chunk_size = min_chunk_size

        # If semantic chunking was requested
        if use_semantic_chunking:
            # Check that the required dependency is installed
            if not SEMANTIC_CHUNKER_AVAILABLE:
                raise ImportError(
                    "Semantic chunking requires the langchain-experimental package.\n"
                    "Run: pip install langchain-experimental\n"
                    "Or use character chunking (use_semantic_chunking=False)"
                )

            # Check that embeddings were provided
            if embeddings is None:
                raise ValueError(
                    "The embeddings argument is required for semantic chunking.\n"
                    "Example:\n"
                    "  from langchain_community.embeddings import HuggingFaceEmbeddings\n"
                    "  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')\n"
                    "  processor = DocumentProcessor(embeddings=embeddings, use_semantic_chunking=True)"
                )

            # Initialize the semantic chunker
            # Uses the "standard deviation" strategy: split when the semantic distance between adjacent
            # sentences exceeds the mean distance by the given number of standard deviations
            self.text_splitter = SemanticChunker(
                embeddings,
                breakpoint_threshold_type="standard_deviation",
                breakpoint_threshold_amount=breakpoint_threshold_amount
            )
            print(f"✓ Semantic chunking mode (sensitivity: {breakpoint_threshold_amount}, min chunk size: {min_chunk_size} characters)")
        else:
            # Traditional character chunking (default mode)
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
            )
            print(f"✓ Character chunking mode (size: {chunk_size} characters, overlap: {chunk_overlap} characters)")

    def _post_process_chunks(self, chunks: List[str]) -> List[str]:
        """
        Post-process chunks: filter and merge chunks that are too small

        Semantic chunking can produce some very small chunks (for example only a few words)
        that may not carry enough context. This method:
        1. Merges chunks smaller than min_chunk_size into neighboring chunks
        2. Ensures the final chunks all have a reasonable size

        Args:
            chunks: raw chunk list (as produced by the splitter)

        Returns:
            Processed chunk list (after filtering and merging)
        """
        # Character chunking needs no post-processing (chunks already have a fixed size)
        if not self.use_semantic_chunking:
            return chunks

        # Nothing to do for an empty list
        if not chunks:
            return chunks

        processed = []
        current_small_chunk = ""  # accumulated small chunk

        for chunk in chunks:
            chunk_stripped = chunk.strip()
            chunk_length = len(chunk_stripped)

            # If the current chunk is too small, try to merge it with the next one
            if chunk_length < self.min_chunk_size:
                # Accumulate into the temporary variable
                if current_small_chunk:
                    current_small_chunk += "\n\n" + chunk
                else:
                    current_small_chunk = chunk
            else:
                # The current chunk is large enough
                # If there is an accumulated small chunk, handle it first
                if current_small_chunk:
                    current_small_chunk_stripped = current_small_chunk.strip()
                    if len(current_small_chunk_stripped) >= self.min_chunk_size:
                        # Large enough after accumulation: keep as a standalone chunk
                        processed.append(current_small_chunk)
                    else:
                        # Still too small: merge into the previous chunk (if any)
                        if processed:
                            processed[-1] += "\n\n" + current_small_chunk
                        else:
                            # No previous chunk: keep it anyway
                            processed.append(current_small_chunk)
                    current_small_chunk = ""

                # Append the current, sufficiently large chunk
                processed.append(chunk)

        # Handle the final accumulated small chunk
        if current_small_chunk:
            current_small_chunk_stripped = current_small_chunk.strip()
            if len(current_small_chunk_stripped) >= self.min_chunk_size:
                # Large enough: keep as a standalone chunk
                processed.append(current_small_chunk)
            elif processed:
                # Too small: merge into the last chunk
                processed[-1] += "\n\n" + current_small_chunk
            else:
                # No other chunks: keep it anyway
                processed.append(current_small_chunk)

        return processed

    def fetch_papers(self, query: str, max_results: int = 10) -> List[Dict]:
        """
        Fetch papers from arXiv

        Args:
            query: search query (e.g. "cat:cs.AI")
            max_results: maximum number of results

        Returns:
            A list of papers, each with title, abstract, and other metadata
        """
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        papers = []
        for paper in search.results():
            papers.append({
                "title": paper.title,
                "authors": [author.name for author in paper.authors],
                "summary": paper.summary,
                "published": str(paper.published),
                "arxiv_id": paper.entry_id.split('/')[-1],
                "arxiv_url": paper.entry_id,
                "pdf_url": paper.pdf_url,
                "categories": paper.categories,
            })

        return papers

    def process_documents(self, papers: List[Dict]) -> List[Dict]:
        """
        Process papers, splitting each one into chunks

        Args:
            papers: list of papers

        Returns:
            Processed document chunks, each with content and metadata
        """
        documents = []

        for paper in papers:
            # Combine the paper's full text (title + abstract)
            # Keep the \n\n newlines as structural hints for semantic breakpoints
            full_text = f"Title: {paper['title']}\n\nAbstract: {paper['summary']}"

            # Split the text (character or semantic mode, per configuration)
            chunks = self.text_splitter.split_text(full_text)

            # Post-process: filter and merge overly small chunks (semantic mode only)
            chunks = self._post_process_chunks(chunks)

            # Create a document object per chunk
            for i, chunk in enumerate(chunks):
                doc = {
                    "content": chunk,
                    "metadata": {
                        "title": paper['title'],
                        "arxiv_id": paper['arxiv_id'],
                        "arxiv_url": paper['arxiv_url'],
                        "pdf_url": paper['pdf_url'],
                        "authors": paper['authors'],
                        "published": paper['published'],
                        "categories": paper['categories'],
                        "chunk_index": i,
                        "total_chunks": len(chunks),
                        "chunking_method": "semantic" if self.use_semantic_chunking else "character"
                    }
                }
                documents.append(doc)

        return documents

    def get_texts_and_metadatas(self, documents: List[Dict]):
        """
        Extract the texts and metadata from a document list

        Args:
            documents: document list

        Returns:
            (texts, metadatas) tuple
        """
        texts = [doc["content"] for doc in documents]
        metadatas = [doc["metadata"] for doc in documents]
        return texts, metadatas

    @staticmethod
    def clean_extracted_text(text: str) -> str:
        """
        Clean text extracted from PDF/DOCX: remove excess whitespace and repair per-character line breaks

        Some PDF extraction tools insert spaces or newlines between individual characters,
        especially in Chinese text. This method:
        1. Repairs the "one character per line" problem (merging single-character lines)
        2. Removes excess spaces between Chinese characters
        3. Keeps spaces between English words
        4. Keeps appropriate spaces around punctuation
        5. Keeps genuine paragraph separation

        Args:
            text: raw extracted text

        Returns:
            Cleaned text
        """
        if not text:
            return text

        # Step 0: repair the "one character per line" problem
        # Detect the pattern where each line holds a single character (a Chinese character,
        # punctuation mark, or single letter/digit) and merge such lines into continuous text
        lines = text.split('\n')
        merged_lines = []
        i = 0

        def is_single_char_line(line: str) -> bool:
            """
            Decide whether a line is a single-character line
            Heuristic: stripped length <= 3 (possibly one character plus punctuation or spaces)
            """
            stripped = line.strip()
            if not stripped:
                return False  # blank lines do not count
            # Stripped length <= 3 and essentially a single Chinese character, punctuation mark, or letter/digit
            if len(stripped) <= 3:
                # After removing all spaces, a length <= 2 counts as a single-character line
                no_space = stripped.replace(' ', '')
                if len(no_space) <= 2:
                    return True
            return False

        while i < len(lines):
            line = lines[i]
            stripped_line = line.strip()

            # The current line is a single-character line
            if is_single_char_line(line):
                # Collect consecutive single-character lines (including blank lines, which may just be separators)
                merged_chars = []
                j = i
                consecutive_single_chars = 0

                while j < len(lines):
                    current_line = lines[j]
                    current_stripped = current_line.strip()

                    if is_single_char_line(current_line):
                        # Single-character line: collect the character (spaces removed)
                        char = current_stripped.replace(' ', '')
                        if char:
                            merged_chars.append(char)
                        consecutive_single_chars += 1
                        j += 1
                    elif not current_stripped:
                        # Blank line: if single characters came before and may still follow, skip it
                        # Check whether the next line is also a single-character line
                        if j + 1 < len(lines) and is_single_char_line(lines[j + 1]):
                            # More single characters after the blank line: skip it and keep collecting
                            j += 1
                        else:
                            # No more single characters after the blank line: stop collecting
                            break
                    else:
                        # Normal line: stop collecting
                        break

                # If several single characters were collected, merge them
                if len(merged_chars) > 1:
                    merged_text = ''.join(merged_chars)
                    merged_lines.append(merged_text)
                    i = j
                    continue
                elif len(merged_chars) == 1 and consecutive_single_chars > 1:
                    # Only one character but several lines (possibly caused by spaces): merge as well
                    merged_text = ''.join(merged_chars)
                    merged_lines.append(merged_text)
                    i = j
                    continue
                else:
                    # Just one single character on one line: keep it as-is
                    if merged_chars:
                        merged_lines.append(merged_chars[0])
                    i = j
                    continue
            else:
                # Normal line: append directly
                if stripped_line:  # non-blank line
                    merged_lines.append(stripped_line)
                i += 1

        # Reassemble the text
        text = '\n'.join(merged_lines)

        # Step 0.5: handle possible leftovers
        # If single-character lines remain (missed by the first pass), process them again
        lines = text.split('\n')
        final_lines = []
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            if is_single_char_line(line):
                # Collect consecutive single-character lines once more
                merged_chars = []
                j = i
                while j < len(lines) and is_single_char_line(lines[j]):
                    char = lines[j].strip().replace(' ', '')
                    if char:
                        merged_chars.append(char)
                    j += 1

                if len(merged_chars) > 1:
                    final_lines.append(''.join(merged_chars))
                    i = j
                else:
                    if merged_chars:
                        final_lines.append(merged_chars[0])
                    i = j
            else:
                if line:
                    final_lines.append(line)
                i += 1

        text = '\n'.join(final_lines)

        # 1. Remove spaces between Chinese characters
        # Pattern: Chinese character + spaces + Chinese character
        chinese_char_pattern = r'([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])\s+([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])'
        text = re.sub(chinese_char_pattern, r'\1\2', text)

        # 2. Remove excess spaces between Chinese text and punctuation
        # Chinese + spaces + punctuation
        chinese_punct_pattern = r'([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])\s+([,。、;:!?""''()【】《》])'
        text = re.sub(chinese_punct_pattern, r'\1\2', text)

        # Punctuation + spaces + Chinese
        # Use re.escape to handle the punctuation correctly and avoid escape-sequence warnings
        punct_chars = ',。、;:!?""''()【】《》'
        punct_chinese_pattern = f'([{re.escape(punct_chars)}])\\s+([\\u4e00-\\u9fff\\u3400-\\u4dbf\\uf900-\\ufaff])'
        text = re.sub(punct_chinese_pattern, r'\1\2', text)

        # 3. Remove excess spaces between digits and Chinese (e.g. "500 公里" -> "500公里")
        number_chinese_pattern = r'(\d+)\s+([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])'
        text = re.sub(number_chinese_pattern, r'\1\2', text)
        chinese_number_pattern = r'([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])\s+(\d+)'
        text = re.sub(chinese_number_pattern, r'\1\2', text)

        # 4. Remove stray spaces inside mixed text (e.g. "Nebula-X 跨次 元量" -> "Nebula-X 跨次元量")
        # while keeping the spaces between English words
        # Match: Chinese character + spaces + Chinese character (spaces removed only between Chinese pairs)
        mixed_space_pattern = r'([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])\s+([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])'
        text = re.sub(mixed_space_pattern, r'\1\2', text)

        # 5. Collapse runs of spaces (keep single spaces, used between English words)
        text = re.sub(r' +', ' ', text)

        # 6. Trim leading/trailing spaces per line (keeping the newlines)
        lines = text.split('\n')
        cleaned_lines = [line.strip() for line in lines]
        text = '\n'.join(cleaned_lines)

        # 7. Collapse runs of newlines (keep at most two, for paragraph separation)
        text = re.sub(r'\n{3,}', '\n\n', text)

        # 8. Fix possible leftovers: remove any remaining spaces between Chinese characters
        # (a second pass over cases the earlier substitutions may have missed)
        text = re.sub(r'([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])\s+([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])', r'\1\2', text)

        return text

    def load_from_file(self, file_path: str) -> Dict:
        """
        Load a document from a local file (supports PDF, DOCX, TXT, etc.)

        Args:
            file_path: path to the file

        Returns:
            Document dict with content and metadata
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File does not exist: {file_path}")

        file_ext = file_path.suffix.lower()
        file_name = file_path.stem
        file_size = os.path.getsize(file_path)

        # Pick a loader based on the file type
        if file_ext == '.pdf':
            try:
                from langchain_community.document_loaders import PyPDFLoader
                loader = PyPDFLoader(str(file_path))
                pages = loader.load()
                # Merge all pages
                full_text = "\n\n".join([page.page_content for page in pages])
                # Clean the extracted text (remove excess whitespace)
                full_text = self.clean_extracted_text(full_text)
            except ImportError:
                raise ImportError(
                    "pypdf is required to handle PDF files: pip install pypdf"
                )

        elif file_ext in ['.docx', '.doc']:
            try:
                from langchain_community.document_loaders import Docx2txtLoader
                loader = Docx2txtLoader(str(file_path))
                pages = loader.load()
                full_text = "\n\n".join([page.page_content for page in pages])
                # Clean the extracted text (remove excess whitespace)
                full_text = self.clean_extracted_text(full_text)
            except ImportError:
                raise ImportError(
                    "docx2txt is required to handle DOCX files: pip install docx2txt"
                )

        elif file_ext == '.txt':
            # Try several encodings
            encodings = ['utf-8', 'gbk', 'big5', 'latin-1']
            full_text = None
            for encoding in encodings:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        full_text = f.read()
                    break
                except UnicodeDecodeError:
                    continue

            if full_text is None:
                raise ValueError(f"Could not read the file; none of the attempted encodings worked: {encodings}")

        else:
            raise ValueError(
                f"Unsupported file type: {file_ext}\n"
                f"Supported formats: .pdf, .docx, .doc, .txt"
            )

        if not full_text or len(full_text.strip()) == 0:
            raise ValueError(f"File is empty or no text could be extracted: {file_path}")

        return {
            "title": file_name,
            "content": full_text,
            "file_path": str(file_path),
            "file_type": file_ext,
            "file_size": file_size,
        }

    def process_file(self, file_path: str) -> List[Dict]:
        """
        Process a single file, splitting it into chunks

        Args:
            file_path: path to the file

        Returns:
            List of processed document chunks
        """
        # Load the file
        file_doc = self.load_from_file(file_path)

        # Split the text (character or semantic mode, per configuration)
        chunks = self.text_splitter.split_text(file_doc["content"])

        # Post-process: filter and merge overly small chunks (semantic mode only)
        chunks = self._post_process_chunks(chunks)

        if not chunks:
            raise ValueError(f"No content after splitting the file: {file_path}")

        # Create the document chunks
        documents = []
        for i, chunk in enumerate(chunks):
            doc = {
                "content": chunk,
                "metadata": {
                    "title": file_doc["title"],
                    "file_path": file_doc["file_path"],
                    "file_type": file_doc["file_type"],
                    "file_size": file_doc["file_size"],
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "chunking_method": "semantic" if self.use_semantic_chunking else "character"
                }
            }
            documents.append(doc)

        return documents

    def process_files(self, file_paths: List[str]) -> List[Dict]:
        """
        Process several files

        Args:
            file_paths: list of file paths

        Returns:
            Document chunks from all files
        """
        all_documents = []
        for file_path in file_paths:
            try:
                print(f"Processing file: {file_path}")
                documents = self.process_file(file_path)
                all_documents.extend(documents)
                print(f"  ✓ Created {len(documents)} chunks")
            except Exception as e:
                print(f"  ✗ Failed to process file: {file_path}")
                print(f"    Error: {e}")
                continue

        return all_documents
src/hybrid_subquery_hyde_rag.py  ADDED

@@ -0,0 +1,399 @@
"""
Hybrid Sub-query + HyDE RAG: fuses Sub-query Decomposition with HyDE
Combines the strengths of both methods to improve retrieval precision
"""
from typing import List, Dict, Optional
from .retrievers.reranker import RAGPipeline
from .retrievers.vector_retriever import VectorRetriever
from .prompt_formatter import PromptFormatter
from .llm_integration import OllamaLLM
import hashlib
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

logger = logging.getLogger(__name__)


class HybridSubqueryHyDERAG:
    """RAG system that fuses Sub-query Decomposition and HyDE"""

    def __init__(
        self,
        rag_pipeline: RAGPipeline,
        vector_retriever: VectorRetriever,
        llm: OllamaLLM,
        max_sub_queries: int = 3,
        top_k_per_subquery: int = 5,
        hypothetical_length: int = 200,
        temperature_subquery: float = 0.3,
        temperature_hyde: float = 0.7,
        enable_parallel: bool = True
    ):
        """
        Initialize the hybrid RAG

        Args:
            rag_pipeline: RAG pipeline instance
            vector_retriever: vector retriever
            llm: LLM instance
            max_sub_queries: maximum number of sub-questions to generate
            top_k_per_subquery: number of results retrieved per sub-question
            hypothetical_length: target length of the hypothetical document (characters)
            temperature_subquery: temperature for sub-question generation (lower, more stable)
            temperature_hyde: temperature for hypothetical-document generation (higher, richer terminology)
            enable_parallel: whether to process sub-questions in parallel
        """
        self.rag_pipeline = rag_pipeline
        self.vector_retriever = vector_retriever
        self.llm = llm
        self.max_sub_queries = max_sub_queries
        self.top_k_per_subquery = top_k_per_subquery
        self.hypothetical_length = hypothetical_length
        self.temperature_subquery = temperature_subquery
        self.temperature_hyde = temperature_hyde
        self.enable_parallel = enable_parallel

    def _generate_sub_queries(self, question: str) -> List[str]:
        """
        Generate sub-questions (same as SubQueryDecompositionRAG)

        Args:
            question: original question

        Returns:
            List of sub-questions
        """
        is_chinese = PromptFormatter.detect_language(question) == "zh"

        if is_chinese:
            prompt = f"""你是一個專業助理。請將以下原始問題拆解成最多 {self.max_sub_queries} 個具體的子問題,以便進行資料搜尋。
每個子問題應專注於原始問題的一個特定面向。請以換行符號分隔問題。

原始問題: {question}

子問題清單:"""
        else:
            prompt = f"""You are a professional assistant. Please decompose the following original question into at most {self.max_sub_queries} specific sub-questions for information retrieval.
Each sub-question should focus on a specific aspect of the original question. Please separate questions with newlines.

Original question: {question}

Sub-question list:"""

        try:
            response = self.llm.generate(
                prompt=prompt,
                temperature=self.temperature_subquery,
                max_tokens=500
            )

            sub_queries = [
                q.strip()
                for q in response.strip().split("\n")
                if q.strip() and not q.strip().startswith("#")
            ]

            # Strip numbering prefixes (such as "1. ", "1) ", etc.)
            cleaned_queries = []
            for q in sub_queries:
                q = q.lstrip("0123456789. )")
                q = q.strip()
                if q:
                    cleaned_queries.append(q)

            cleaned_queries = cleaned_queries[:self.max_sub_queries]

            if not cleaned_queries:
                logger.warning("⚠️ No sub-questions generated; using the original question")
                cleaned_queries = [question]

            return cleaned_queries

        except Exception as e:
            logger.error(f"⚠️ Error while generating sub-questions: {e}")
            return [question]

    def _generate_hypothetical_document(self, sub_query: str) -> str:
        """
        Generate a hypothetical document for a sub-question (same as HyDERAG)

        Args:
            sub_query: sub-question

        Returns:
            Hypothetical document text
        """
        is_chinese = PromptFormatter.detect_language(sub_query) == "zh"

        if is_chinese:
            prompt = f"""請針對以下問題,寫出一段約 {self.hypothetical_length} 字的專業技術檔案內容。
這段內容應包含該領域常見的專業術語與原理說明,以便用於後續的語義檢索。
請使用專業的術語和概念,即使你對某些細節不確定,也要包含相關的專業詞彙。

問題: {sub_query}

專業技術內容:"""
        else:
            prompt = f"""Please write a professional technical document of approximately {self.hypothetical_length} words in response to the following question.
This content should include common professional terminology and principle explanations in this field, to be used for subsequent semantic retrieval.
Please use professional terms and concepts, and include relevant professional vocabulary even if you are uncertain about some details.

Question: {sub_query}

Professional technical content:"""

        try:
            hypothetical_doc = self.llm.generate(
                prompt=prompt,
                temperature=self.temperature_hyde,
                max_tokens=500
            )

            hypothetical_doc = hypothetical_doc.strip()

            if not hypothetical_doc:
                logger.warning(f"⚠️ Hypothetical document for sub-question '{sub_query}' is empty; using the sub-question itself")
                return sub_query

            logger.debug(f"✅ Generated hypothetical document for sub-question (length: {len(hypothetical_doc)} characters)")
            return hypothetical_doc

        except Exception as e:
            logger.error(f"⚠️ Error while generating the hypothetical document: {e}")
            return sub_query

    def _get_doc_id(self, doc: Dict) -> str:
        """
        Build a unique identifier for a document

        Args:
            doc: document dict

        Returns:
            Unique ID
        """
        metadata = doc.get("metadata", {})
        content = doc.get("content", "")

        if "arxiv_id" in metadata and "chunk_index" in metadata:
            return f"{metadata['arxiv_id']}_{metadata['chunk_index']}"
        elif "file_path" in metadata and "chunk_index" in metadata:
            return f"{metadata['file_path']}_{metadata['chunk_index']}"
        else:
            content_hash = hashlib.md5(content.encode()).hexdigest()[:16]
            return f"doc_{content_hash}"

    def _process_subquery_with_hyde(
        self,
        sub_query: str,
        metadata_filter: Optional[Dict] = None
    ) -> tuple:
        """
        Process a single sub-question: generate the hypothetical document and retrieve

        Args:
            sub_query: sub-question
            metadata_filter: optional metadata filter

        Returns:
            (retrieval results, hypothetical document)
        """
        try:
            # Generate the hypothetical document
            hypothetical_doc = self._generate_hypothetical_document(sub_query)

            # Retrieve using the hypothetical document
            results = self.vector_retriever.retrieve(
                query=hypothetical_doc,  # use the hypothetical document rather than the sub-question
                top_k=self.top_k_per_subquery,
                metadata_filter=metadata_filter
            )

            return results, hypothetical_doc

        except Exception as e:
            logger.error(f"⚠️ Error while processing sub-question '{sub_query}': {e}")
            return [], ""

    def query(
        self,
        question: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        return_sub_queries: bool = False,
        return_hypothetical: bool = False
    ) -> Dict:
        """
        Run the hybrid RAG retrieval (without answer generation)

        Args:
            question: original question
            top_k: number of results to return
            metadata_filter: optional metadata filter
            return_sub_queries: whether to return the sub-question list
            return_hypothetical: whether to return the hypothetical-document dict (sub-question -> document)

        Returns:
            Dict with retrieval results and statistics
        """
        start_time = time.time()

        # Step 1: generate the sub-questions
        logger.info(f"🔍 Decomposing question: '{question}'")
        sub_queries = self._generate_sub_queries(question)
        logger.info(f"✅ Generated {len(sub_queries)} sub-questions")

        # Step 2: generate a hypothetical document per sub-question and retrieve
        logger.info(f"📚 Generating hypothetical documents and retrieving per sub-question...")
        unique_docs = {}
        hypothetical_docs = {}

        if self.enable_parallel and len(sub_queries) > 1:
            # Parallel processing
            logger.info(f"🔄 Processing {len(sub_queries)} sub-questions in parallel...")
            with ThreadPoolExecutor(max_workers=min(len(sub_queries), 5)) as executor:
                future_to_query = {
                    executor.submit(self._process_subquery_with_hyde, sq, metadata_filter): sq
                    for sq in sub_queries
                }

                for future in as_completed(future_to_query):
                    sub_query = future_to_query[future]
                    try:
                        results, hypo_doc = future.result()
                        hypothetical_docs[sub_query] = hypo_doc

                        logger.debug(f"✅ Sub-question '{sub_query}' found {len(results)} results")

                        for doc in results:
                            doc_id = self._get_doc_id(doc)
                            if doc_id not in unique_docs:
                                unique_docs[doc_id] = doc
                            else:
                                # Keep the higher-scoring copy
                                existing_score = unique_docs[doc_id].get('score', 0)
                                new_score = doc.get('score', 0)
                                if new_score > existing_score:
                                    unique_docs[doc_id] = doc
                    except Exception as e:
                        logger.error(f"⚠️ Error while processing sub-question '{sub_query}': {e}")
        else:
            # Sequential processing
            logger.info(f"🔄 Processing {len(sub_queries)} sub-questions sequentially...")
            for sub_query in sub_queries:
                results, hypo_doc = self._process_subquery_with_hyde(sub_query, metadata_filter)
                hypothetical_docs[sub_query] = hypo_doc

                logger.debug(f"✅ Sub-question '{sub_query}' found {len(results)} results")

                for doc in results:
                    doc_id = self._get_doc_id(doc)
                    if doc_id not in unique_docs:
                        unique_docs[doc_id] = doc
                    else:
                        existing_score = unique_docs[doc_id].get('score', 0)
                        new_score = doc.get('score', 0)
                        if new_score > existing_score:
                            unique_docs[doc_id] = doc

        # Step 3: sort and return the top_k
        result_list = list(unique_docs.values())
        result_list.sort(key=lambda x: x.get('score', 0), reverse=True)
        final_results = result_list[:top_k]

        elapsed_time = time.time() - start_time
        logger.info(f"✅ Found {len(final_results)} unique documents (after dedup, out of {len(result_list)} total)")

        return {
            "results": final_results,
            "total_docs_found": len(result_list),
            "sub_queries": sub_queries if return_sub_queries else None,
            "hypothetical_documents": hypothetical_docs if return_hypothetical else None,
            "elapsed_time": elapsed_time
        }

    def generate_answer(
        self,
        question: str,
        formatter: PromptFormatter,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        document_type: str = "general",
        return_sub_queries: bool = False,
        return_hypothetical: bool = False
    ) -> Dict:
        """
        Full hybrid RAG flow: retrieval + answer generation

        Args:
            question: original question
            formatter: prompt formatter
            top_k: number of documents used for answer generation
            metadata_filter: optional metadata filter
            document_type: document type ("paper", "cv", "general")
            return_sub_queries: whether to return the sub-question list
            return_hypothetical: whether to return the hypothetical-document dict

        Returns:
            Dict with retrieval results, the generated answer, and statistics
        """
        start_time = time.time()

        # Retrieve
        retrieval_result = self.query(
            question=question,
            top_k=top_k,
            metadata_filter=metadata_filter,
            return_sub_queries=return_sub_queries,
            return_hypothetical=return_hypothetical
        )

        if not retrieval_result["results"]:
            return {
                **retrieval_result,
                "answer": "抱歉,未找到相關文檔來回答此問題。",
                "formatted_context": None,
                "answer_time": 0.0,
                "total_time": retrieval_result["elapsed_time"]
            }

        # Format the context
        formatted_context = formatter.format_context(
            retrieval_result["results"],
            document_type=document_type
        )

        # Build the prompt (using the original question)
        prompt = formatter.create_prompt(
            question,
            formatted_context,
            document_type=document_type
        )

        # Generate the answer
        logger.info("🤖 Generating the answer...")
        answer_start = time.time()
        try:
            answer = self.llm.generate(
                prompt=prompt,
                temperature=0.7,
                max_tokens=2048
            )
            answer_time = time.time() - answer_start
            logger.info(f"✅ Answer generated (took {answer_time:.2f}s)")
        except Exception as e:
            logger.error(f"❌ Error while generating the answer: {e}")
            answer = f"生成回答時出錯: {e}"
            answer_time = time.time() - answer_start

        total_time = time.time() - start_time

        return {
            **retrieval_result,
            "answer": answer,
            "formatted_context": formatted_context,
            "answer_time": answer_time,
            "total_time": total_time
        }
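A usage sketch; the rag_pipeline, vector_retriever, llm, and formatter objects are assumed to be constructed from the other modules added in this commit (their constructors are not shown on this page, so the construction details are an assumption):

# Sketch; rag_pipeline, vector_retriever, llm, and formatter are assumed instances
# of RAGPipeline, VectorRetriever, OllamaLLM, and PromptFormatter.
from src.hybrid_subquery_hyde_rag import HybridSubqueryHyDERAG

hybrid = HybridSubqueryHyDERAG(rag_pipeline, vector_retriever, llm, max_sub_queries=3)
out = hybrid.generate_answer("How does HyDE improve retrieval?", formatter=formatter,
                             top_k=5, return_sub_queries=True)
print(out["sub_queries"])
print(out["answer"])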
src/hyde_rag.py
ADDED
@@ -0,0 +1,235 @@
"""
HyDE (Hypothetical Document Embeddings) RAG: improve retrieval with hypothetical documents
"""
from typing import List, Dict, Optional
from .retrievers.reranker import RAGPipeline
from .retrievers.vector_retriever import VectorRetriever
from .prompt_formatter import PromptFormatter
from .llm_integration import OllamaLLM
import time
import logging

logger = logging.getLogger(__name__)


class HyDERAG:
    """RAG system using HyDE (Hypothetical Document Embeddings)"""

    def __init__(
        self,
        rag_pipeline: RAGPipeline,
        vector_retriever: VectorRetriever,
        llm: OllamaLLM,
        hypothetical_length: int = 200,
        temperature: float = 0.7
    ):
        """
        Initialize HyDE RAG

        Args:
            rag_pipeline: RAG pipeline instance (used for final answer generation)
            vector_retriever: vector retriever (used for retrieval based on the hypothetical document)
            llm: LLM instance (used to generate the hypothetical document)
            hypothetical_length: target length of the hypothetical document (characters)
            temperature: temperature for generating the hypothetical document (0.7 recommended, to surface more domain terminology)
        """
        self.rag_pipeline = rag_pipeline
        self.vector_retriever = vector_retriever
        self.llm = llm
        self.hypothetical_length = hypothetical_length
        self.temperature = temperature

    def _generate_hypothetical_document(self, question: str) -> str:
        """
        Generate a hypothetical document

        Args:
            question: user question

        Returns:
            hypothetical document text
        """
        # Detect the language
        is_chinese = PromptFormatter.detect_language(question) == "zh"

        if is_chinese:
            prompt = f"""請針對以下問題,寫出一段約 {self.hypothetical_length} 字的專業技術檔案內容。
這段內容應包含該領域常見的專業術語與原理說明,以便用於後續的語義檢索。
請使用專業的術語和概念,即使你對某些細節不確定,也要包含相關的專業詞彙。

問題: {question}

專業技術內容:"""
        else:
            prompt = f"""Please write a professional technical document of approximately {self.hypothetical_length} words in response to the following question.
This content should include common professional terminology and principle explanations in this field, to be used for subsequent semantic retrieval.
Please use professional terms and concepts, and include relevant professional vocabulary even if you are uncertain about some details.

Question: {question}

Professional technical content:"""

        try:
            hypothetical_doc = self.llm.generate(
                prompt=prompt,
                temperature=self.temperature,  # higher temperature to surface more domain terminology
                max_tokens=500
            )

            # Clean up the output
            hypothetical_doc = hypothetical_doc.strip()

            if not hypothetical_doc:
                logger.warning("⚠️ Generated hypothetical document is empty; falling back to the original question")
                return question

            logger.info(f"✅ Generated hypothetical document (length: {len(hypothetical_doc)} characters)")
            return hypothetical_doc

        except Exception as e:
            logger.error(f"⚠️ Error while generating the hypothetical document: {e}")
            # Fall back to the original question
            return question

    def query(
        self,
        question: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        return_hypothetical: bool = False
    ) -> Dict:
        """
        Run HyDE retrieval (without answer generation)

        Args:
            question: original question
            top_k: return the top k results
            metadata_filter: optional metadata filter conditions
            return_hypothetical: whether to include the hypothetical document in the result

        Returns:
            dict with retrieval results and statistics
        """
        start_time = time.time()

        # Step 1: generate the hypothetical document
        logger.info(f"🔍 Generating hypothetical document: '{question}'")
        hypothetical_doc = self._generate_hypothetical_document(question)

        # Step 2: retrieve using the hypothetical document
        logger.info("📚 Retrieving with the hypothetical document...")
        results = self.vector_retriever.retrieve(
            query=hypothetical_doc,  # use the hypothetical document instead of the original question
            top_k=top_k,
            metadata_filter=metadata_filter
        )

        elapsed_time = time.time() - start_time
        logger.info(f"✅ Found {len(results)} results (elapsed: {elapsed_time:.2f}s)")

        result = {
            "results": results,
            "total_docs_found": len(results),
            "hypothetical_document": hypothetical_doc if return_hypothetical else None,
            "elapsed_time": elapsed_time
        }

        return result

    def generate_answer(
        self,
        question: str,
        formatter: PromptFormatter,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        document_type: str = "general",
        return_hypothetical: bool = False
    ) -> Dict:
        """
        Full HyDE RAG flow: generate hypothetical document -> retrieve -> generate answer

        Args:
            question: original question
            formatter: prompt formatter
            top_k: number of documents used for answer generation
            metadata_filter: optional metadata filter conditions
            document_type: document type ("paper", "cv", "general")
            return_hypothetical: whether to include the hypothetical document in the result

        Returns:
            dict with retrieval results, the generated answer, and statistics
        """
        start_time = time.time()

        # Step 1: generate the hypothetical document
        logger.info(f"🔍 Generating hypothetical document: '{question}'")
        hypothetical_start = time.time()
        hypothetical_doc = self._generate_hypothetical_document(question)
        hypothetical_time = time.time() - hypothetical_start

        # Step 2: retrieve using the hypothetical document
        logger.info("📚 Retrieving with the hypothetical document...")
        retrieval_start = time.time()
        results = self.vector_retriever.retrieve(
            query=hypothetical_doc,  # use the hypothetical document instead of the original question
            top_k=top_k,
            metadata_filter=metadata_filter
        )
        retrieval_time = time.time() - retrieval_start

        if not results:
            return {
                "results": [],
                "total_docs_found": 0,
                "hypothetical_document": hypothetical_doc if return_hypothetical else None,
                "elapsed_time": retrieval_time + hypothetical_time,
                "answer": "Sorry, no relevant documents were found to answer this question.",
                "formatted_context": None,
                "answer_time": 0.0,
                "total_time": retrieval_time + hypothetical_time
            }

        # Step 3: format the context
        formatted_context = formatter.format_context(
            results,
            document_type=document_type
        )

        # Step 4: create the prompt (using the original question, not the hypothetical document)
        prompt = formatter.create_prompt(
            question,  # generate the answer from the original question
            formatted_context,
            document_type=document_type
        )

        # Step 5: generate the answer
        logger.info("🤖 Generating answer...")
        answer_start = time.time()
        try:
            answer = self.llm.generate(
                prompt=prompt,
                temperature=0.7,
                max_tokens=2048
            )
            answer_time = time.time() - answer_start
            logger.info(f"✅ Answer generated (elapsed: {answer_time:.2f}s)")
        except Exception as e:
            logger.error(f"❌ Error while generating the answer: {e}")
            answer = f"Error while generating the answer: {e}"
            answer_time = time.time() - answer_start

        total_time = time.time() - start_time

        return {
            "results": results,
            "total_docs_found": len(results),
            "hypothetical_document": hypothetical_doc if return_hypothetical else None,
            "elapsed_time": retrieval_time + hypothetical_time,
            "hypothetical_time": hypothetical_time,
            "retrieval_time": retrieval_time,
            "answer": answer,
            "formatted_context": formatted_context,
            "answer_time": answer_time,
            "total_time": total_time
        }
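A minimal usage sketch for the HyDE module above (not part of the commit). It assumes `rag_pipeline` and `vector_retriever` have already been built from the retriever modules added below; those two names are placeholders for objects constructed elsewhere:

from src.hyde_rag import HyDERAG
from src.prompt_formatter import PromptFormatter
from src.llm_integration import OllamaLLM

llm = HyDERAG.__module__ and OllamaLLM(model_name="llama3.2:3b")
hyde = HyDERAG(rag_pipeline, vector_retriever, llm)  # retrievers built elsewhere

# Retrieval only: the query embedding comes from a generated hypothetical document
hits = hyde.query("How does LoRA fine-tuning work?", top_k=5, return_hypothetical=True)
print(hits["hypothetical_document"])

# Full flow: hypothetical doc -> retrieve -> answer generated from the *original* question
out = hyde.generate_answer("How does LoRA fine-tuning work?", PromptFormatter())
print(out["answer"])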
src/llm_integration.py
ADDED
@@ -0,0 +1,246 @@
"""
LLM integration module: local LLM inference via Ollama
"""
from typing import Optional, Dict, List
import logging
import requests
import json

logger = logging.getLogger(__name__)


class OllamaLLM:
    """Local LLM inference via Ollama"""

    # Recommended models for a 16GB MacBook Air
    RECOMMENDED_MODELS = {
        "deepseek-r1:7b": {
            "name": "deepseek-r1:7b",
            "description": "DeepSeek R1 7B - large model, high quality",
            "memory_required": "~8GB",
            "quality": "excellent"
        },
        "llama3.2:3b": {
            "name": "llama3.2:3b",
            "description": "Meta Llama 3.2 3B - lightweight, fits 16GB RAM",
            "memory_required": "~4GB",
            "quality": "good"
        },
        "llama3.2:1b": {
            "name": "llama3.2:1b",
            "description": "Meta Llama 3.2 1B - ultra-lightweight, fast responses",
            "memory_required": "~2GB",
            "quality": "basic"
        },
        "phi3:mini": {
            "name": "phi3:mini",
            "description": "Microsoft Phi-3 Mini - small model, high quality",
            "memory_required": "~3GB",
            "quality": "good"
        },
        "gemma:2b": {
            "name": "gemma:2b",
            "description": "Google Gemma 2B - lightweight, open source",
            "memory_required": "~3GB",
            "quality": "good"
        },
        "mistral:7b": {
            "name": "mistral:7b",
            "description": "Mistral 7B - larger but high quality (if memory allows)",
            "memory_required": "~8GB",
            "quality": "excellent"
        }
    }

    def __init__(
        self,
        model_name: str = "llama3.2:3b",
        base_url: str = "http://localhost:11434",
        timeout: int = 120
    ):
        """
        Initialize the Ollama LLM

        Args:
            model_name: Ollama model name (default: llama3.2:3b)
            base_url: base URL of the Ollama API
            timeout: request timeout in seconds
        """
        self.model_name = model_name
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout
        self.api_url = f"{self.base_url}/api"

        # Check whether the model is in the recommended list
        if model_name not in self.RECOMMENDED_MODELS:
            logger.warning(
                f"⚠️ Model '{model_name}' is not in the recommended list. "
                f"Recommended models: {', '.join(self.RECOMMENDED_MODELS.keys())}"
            )

        logger.info(f"✅ Ollama LLM initialized (model: {model_name})")

    def _check_ollama_connection(self) -> bool:
        """
        Check whether the Ollama service is reachable

        Returns:
            whether the connection succeeded
        """
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return response.status_code == 200
        except Exception as e:
            logger.error(f"❌ Unable to connect to Ollama: {e}")
            logger.error("   Please make sure Ollama is running: ollama serve")
            return False

    def _check_model_available(self) -> bool:
        """
        Check whether the model has been downloaded

        Returns:
            whether the model is available
        """
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                model_names = [m.get('name', '') for m in models]
                return any(self.model_name in name for name in model_names)
            return False
        except Exception as e:
            logger.error(f"❌ Error while checking the model: {e}")
            return False

    def generate(
        self,
        prompt: str,
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
        stream: bool = False
    ) -> str:
        """
        Generate a response

        Args:
            prompt: input prompt
            temperature: temperature (0.0-1.0), controls randomness
            max_tokens: maximum number of tokens to generate (None uses the model default)
            stream: whether to stream the output

        Returns:
            the generated response
        """
        # Check the connection
        if not self._check_ollama_connection():
            raise ConnectionError(
                f"Unable to connect to the Ollama service ({self.base_url})\n"
                f"Please make sure Ollama is running:\n"
                f"  1. Install Ollama: https://ollama.ai\n"
                f"  2. Start the service: ollama serve\n"
                f"  3. Download the model: ollama pull {self.model_name}"
            )

        # Check the model
        if not self._check_model_available():
            logger.warning(
                f"⚠️ Model '{self.model_name}' may not be downloaded. "
                f"Please run: ollama pull {self.model_name}"
            )

        # Prepare the request payload
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": stream,
            "options": {
                "temperature": temperature,
            }
        }

        if max_tokens:
            payload["options"]["num_predict"] = max_tokens

        try:
            # Send the request
            response = requests.post(
                f"{self.api_url}/generate",
                json=payload,
                timeout=self.timeout,
                stream=stream
            )

            if response.status_code != 200:
                error_msg = response.text
                raise RuntimeError(f"Ollama API error: {error_msg}")

            if stream:
                # Streaming mode
                full_response = ""
                for line in response.iter_lines():
                    if line:
                        try:
                            data = json.loads(line)
                            if 'response' in data:
                                chunk = data['response']
                                full_response += chunk
                                print(chunk, end='', flush=True)
                            if data.get('done', False):
                                break
                        except json.JSONDecodeError:
                            continue
                print()  # newline
                return full_response
            else:
                # Non-streaming mode
                data = response.json()
                return data.get('response', '')

        except requests.exceptions.Timeout:
            raise TimeoutError(
                f"Request timed out ({self.timeout}s). "
                f"Try increasing the timeout or using a smaller model."
            )
        except requests.exceptions.ConnectionError:
            raise ConnectionError(
                "Unable to connect to the Ollama service. "
                "Please make sure Ollama is running: ollama serve"
            )
        except Exception as e:
            logger.error(f"❌ Error while generating a response: {e}")
            raise

    def list_available_models(self) -> List[str]:
        """
        List the locally available models

        Returns:
            list of available model names
        """
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                return [m.get('name', '') for m in models]
            return []
        except Exception as e:
            logger.error(f"❌ Error while fetching the model list: {e}")
            return []

    @classmethod
    def print_recommended_models(cls):
        """Print the recommended model list"""
        print("\n" + "="*60)
        print("Recommended Ollama models for a 16GB MacBook Air")
        print("="*60)
        print()

        for model_key, info in cls.RECOMMENDED_MODELS.items():
            print(f"📦 {info['name']}")
            print(f"   Description: {info['description']}")
            print(f"   Memory required: {info['memory_required']}")
            print(f"   Quality: {info['quality']}")
            print(f"   Pull command: ollama pull {info['name']}")
            print()
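A quick smoke test for the Ollama wrapper above (not part of the commit), assuming `ollama serve` is running locally on the default port and the model has been pulled:

from src.llm_integration import OllamaLLM

OllamaLLM.print_recommended_models()

llm = OllamaLLM(model_name="llama3.2:3b", timeout=120)
print(llm.list_available_models())

# Non-streaming call; max_tokens maps to Ollama's num_predict option
text = llm.generate("Explain BM25 in one sentence.", temperature=0.2, max_tokens=128)
print(text)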
src/prompt_formatter.py
ADDED
@@ -0,0 +1,395 @@
"""
Prompt formatting module: format retrieval results into LLM-readable context
"""
from typing import List, Dict, Optional
import re


class PromptFormatter:
    """Formats retrieval results for LLM consumption"""

    def __init__(
        self,
        include_metadata: bool = True,
        format_style: str = "detailed",
        max_context_length: Optional[int] = None,
        auto_detect_language: bool = True
    ):
        """
        Initialize the prompt formatter

        Args:
            include_metadata: whether to include source information
            format_style: format style ("detailed", "simple", "minimal")
            max_context_length: maximum context length in characters; None means unlimited
            auto_detect_language: whether to auto-detect the language and answer in the same language
        """
        self.include_metadata = include_metadata
        self.format_style = format_style
        self.max_context_length = max_context_length
        self.auto_detect_language = auto_detect_language

    @staticmethod
    def detect_language(text: str) -> str:
        """
        Detect the dominant language of a text

        Args:
            text: input text

        Returns:
            "zh" for Chinese, "en" for English
        """
        # Check for Chinese characters (CJK Unified Ideographs ranges)
        chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]')
        chinese_chars = len(chinese_pattern.findall(text))

        # Compute the ratio of Chinese characters
        total_chars = len([c for c in text if c.isalnum() or c.isspace()])

        if total_chars == 0:
            return "en"  # default to English

        chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0

        # Treat the text as Chinese if over 20% of characters are Chinese
        if chinese_ratio > 0.2:
            return "zh"
        else:
            return "en"

    def get_system_prompt(self, language: str = "zh", document_type: str = "general") -> str:
        """
        Get the system prompt for a given language and document type

        Args:
            language: language code ("zh" or "en")
            document_type: document type ("paper", "cv", "general")
                "paper": academic paper
                "cv": CV/resume
                "general": generic document (default)

        Returns:
            system prompt string
        """
        if language == "zh":
            if document_type == "paper":
                return (
                    "你是一個專業的 AI 研究助手,專門回答關於機器學習、"
                    "深度學習和自然語言處理的問題。\n\n"
                    "請基於以下提供的學術論文片段來回答用戶的問題。"
                    "每個片段都標註了來源論文的資訊。\n\n"
                    "回答要求:\n"
                    "1. 基於提供的上下文回答問題\n"
                    "2. 如果上下文不足以回答,請明確說明\n"
                    "3. 在回答中引用具體的論文來源(使用 arXiv ID)\n"
                    "4. 如果不同論文有不同觀點,請分別說明\n"
                    "5. 保持回答簡潔、準確、專業\n"
                    "6. **重要:請使用與用戶問題相同的語言回答**\n"
                )
            elif document_type == "cv":
                return (
                    "你是一個專業的 AI 助手,專門幫助分析和介紹簡歷(CV)內容。\n\n"
                    "請基於以下提供的文檔片段來回答用戶的問題。"
                    "這些片段來自一份簡歷或履歷表。\n\n"
                    "回答要求:\n"
                    "1. 基於提供的上下文回答問題\n"
                    "2. 如果上下文不足以回答,請明確說明\n"
                    "3. 在回答中引用具體的文檔內容\n"
                    "4. 保持回答簡潔、準確、專業\n"
                    "5. **重要:請使用與用戶問題相同的語言回答**\n"
                    "6. **請理解:這些片段就是簡歷的內容,請直接基於這些內容回答問題**\n"
                )
            else:  # general
                return (
                    "你是一個專業的 AI 助手。\n\n"
                    "請基於以下提供的文檔片段來回答用戶的問題。"
                    "每個片段都標註了來源資訊。\n\n"
                    "回答要求:\n"
                    "1. 基於提供的上下文回答問題\n"
                    "2. 如果上下文不足以回答,請明確說明\n"
                    "3. 在回答中引用具體的文檔內容\n"
                    "4. 保持回答簡潔、準確、專業\n"
                    "5. **重要:請使用與用戶問題相同的語言回答**\n"
                )
        else:  # English
            if document_type == "paper":
                return (
                    "You are a professional AI research assistant specializing in "
                    "machine learning, deep learning, and natural language processing.\n\n"
                    "Please answer the user's question based on the provided academic paper excerpts. "
                    "Each excerpt is labeled with source paper information.\n\n"
                    "Answer requirements:\n"
                    "1. Answer the question based on the provided context\n"
                    "2. If the context is insufficient, clearly state so\n"
                    "3. Cite specific paper sources in your answer (using arXiv ID)\n"
                    "4. If different papers have different viewpoints, explain them separately\n"
                    "5. Keep answers concise, accurate, and professional\n"
                    "6. **Important: Please answer in the same language as the user's question**\n"
                )
            elif document_type == "cv":
                return (
                    "You are a professional AI assistant specializing in analyzing and introducing CV (Curriculum Vitae) content.\n\n"
                    "Please answer the user's question based on the provided document excerpts. "
                    "These excerpts are from a CV or resume.\n\n"
                    "Answer requirements:\n"
                    "1. Answer the question based on the provided context\n"
                    "2. If the context is insufficient, clearly state so\n"
                    "3. Cite specific document content in your answer\n"
                    "4. Keep answers concise, accurate, and professional\n"
                    "5. **Important: Please answer in the same language as the user's question**\n"
                    "6. **Please understand: These excerpts ARE the CV content. Please answer directly based on this content.**\n"
                )
            else:  # general
                return (
                    "You are a professional AI assistant.\n\n"
                    "Please answer the user's question based on the provided document excerpts. "
                    "Each excerpt is labeled with source information.\n\n"
                    "Answer requirements:\n"
                    "1. Answer the question based on the provided context\n"
                    "2. If the context is insufficient, clearly state so\n"
                    "3. Cite specific document content in your answer\n"
                    "4. Keep answers concise, accurate, and professional\n"
                    "5. **Important: Please answer in the same language as the user's question**\n"
                )

    def format_context(
        self,
        results: List[Dict],
        include_metadata: Optional[bool] = None,
        format_style: Optional[str] = None,
        document_type: str = "general"
    ) -> str:
        """
        Format retrieval results into LLM-readable context

        Args:
            results: list of retrieval results
            include_metadata: whether to include source info (overrides the init parameter)
            format_style: format style (overrides the init parameter)
            document_type: document type ("paper", "cv", "general"), used to adjust the layout

        Returns:
            formatted context string
        """
        if include_metadata is None:
            include_metadata = self.include_metadata
        if format_style is None:
            format_style = self.format_style

        if not results:
            # Pick a language based on the format style
            if format_style == "detailed" or format_style == "simple":
                return "(未找到相關文檔片段)"
            else:
                return "(No relevant excerpts found)"

        formatted_parts = []

        for i, result in enumerate(results, 1):
            content = result.get("content", "")
            metadata = result.get("metadata", {})

            if not include_metadata:
                # No source info; use the content directly
                formatted_parts.append(f"{content}\n")
            elif format_style == "detailed":
                # Detailed format: adjust the displayed fields by document type
                if document_type == "cv":
                    # CV format: show file title and path
                    source_info = (
                        f"[來源 {i}]\n"
                        f"檔案標題: {metadata.get('title', 'N/A')}\n"
                    )
                    if 'file_path' in metadata:
                        source_info += f"檔案路徑: {metadata.get('file_path', 'N/A')}\n"
                    if 'file_type' in metadata:
                        source_info += f"檔案類型: {metadata.get('file_type', 'N/A')}\n"
                elif document_type == "paper":
                    # Paper format: show paper information
                    authors = metadata.get('authors', [])
                    if isinstance(authors, str):
                        authors_str = authors
                    elif isinstance(authors, list):
                        authors_str = ', '.join(authors[:3])  # show at most 3 authors
                        if len(authors) > 3:
                            authors_str += f" 等 {len(authors)} 位作者"
                    else:
                        authors_str = 'N/A'

                    source_info = (
                        f"[來源 {i}]\n"
                        f"論文標題: {metadata.get('title', 'N/A')}\n"
                        f"arXiv ID: {metadata.get('arxiv_id', 'N/A')}\n"
                        f"作者: {authors_str}\n"
                        f"發布日期: {metadata.get('published', 'N/A')}\n"
                    )
                else:
                    # Generic format: show whatever information is available
                    source_info = f"[來源 {i}]\n"
                    if 'title' in metadata:
                        source_info += f"標題: {metadata.get('title', 'N/A')}\n"
                    if 'file_path' in metadata:
                        source_info += f"檔案: {metadata.get('file_path', 'N/A')}\n"
                    if 'arxiv_id' in metadata:
                        source_info += f"arXiv ID: {metadata.get('arxiv_id', 'N/A')}\n"

                # Add the relevance score, if available
                rerank_score = result.get('rerank_score')
                hybrid_score = result.get('hybrid_score')
                if rerank_score is not None:
                    source_info += f"相關性分數: {rerank_score:.4f}\n"
                elif hybrid_score is not None:
                    source_info += f"相關性分數: {hybrid_score:.4f}\n"

                source_info += f"---\n{content}\n"
                formatted_parts.append(source_info)

            elif format_style == "simple":
                # Simple format: key information only
                title = metadata.get('title', 'N/A')
                if document_type == "paper" and 'arxiv_id' in metadata:
                    arxiv_id = metadata.get('arxiv_id', 'N/A')
                    source_info = (
                        f"[來源 {i}: {title} "
                        f"(arXiv:{arxiv_id})]\n"
                        f"{content}\n"
                    )
                elif document_type == "cv" and 'file_path' in metadata:
                    file_path = metadata.get('file_path', 'N/A')
                    source_info = (
                        f"[來源 {i}: {title} "
                        f"({file_path})]\n"
                        f"{content}\n"
                    )
                else:
                    source_info = (
                        f"[來源 {i}: {title}]\n"
                        f"{content}\n"
                    )
                formatted_parts.append(source_info)
            else:  # minimal
                # Minimal format: source tag only
                if document_type == "paper" and 'arxiv_id' in metadata:
                    arxiv_id = metadata.get('arxiv_id', 'N/A')
                    source_info = (
                        f"[arXiv:{arxiv_id}]\n"
                        f"{content}\n"
                    )
                elif 'title' in metadata:
                    title = metadata.get('title', 'N/A')
                    source_info = (
                        f"[{title}]\n"
                        f"{content}\n"
                    )
                else:
                    source_info = (
                        f"[來源 {i}]\n"
                        f"{content}\n"
                    )
                formatted_parts.append(source_info)

        formatted_text = "\n" + "="*60 + "\n".join(formatted_parts)

        # Truncate if a maximum length is set
        if self.max_context_length and len(formatted_text) > self.max_context_length:
            # Truncate from the end, preserving the layout
            formatted_text = formatted_text[:self.max_context_length]
            # Make sure the last source entry is complete
            last_separator = formatted_text.rfind("="*60)
            if last_separator > 0:
                formatted_text = formatted_text[:last_separator] + "\n(內容已截斷...)"

        return formatted_text

    def create_prompt(
        self,
        query: str,
        context: str,
        system_prompt: Optional[str] = None,
        document_type: str = "general"
    ) -> str:
        """
        Create the full LLM prompt

        Args:
            query: user query
            context: formatted context
            system_prompt: optional system prompt (if None, one is chosen automatically by language and document type)
            document_type: document type ("paper", "cv", "general")

        Returns:
            full prompt string
        """
        # Auto-detect the language and pick the matching system prompt
        if system_prompt is None and self.auto_detect_language:
            detected_language = self.detect_language(query)
            system_prompt = self.get_system_prompt(detected_language, document_type)
        elif system_prompt is None:
            # If auto-detection is disabled, default to Chinese
            system_prompt = self.get_system_prompt("zh", document_type)

        # Pick the prompt layout based on the detected language
        detected_language = self.detect_language(query) if self.auto_detect_language else "zh"

        # Pick a closing instruction based on the document type
        if document_type == "paper":
            if detected_language == "zh":
                ending = "## 請基於上述文獻片段回答問題,並在回答中引用具體的論文來源。"
            else:
                ending = "## Please answer the question based on the above document excerpts and cite specific paper sources in your answer."
        else:
            if detected_language == "zh":
                ending = "## 請基於上述文檔片段回答問題,並在回答中引用具體的文檔內容。"
            else:
                ending = "## Please answer the question based on the above document excerpts and cite specific document content in your answer."

        if detected_language == "zh":
            prompt = f"""{system_prompt}

## 相關文檔片段:

{context}

## 用戶問題:

{query}

{ending}"""
        else:  # English
            prompt = f"""{system_prompt}

## Relevant Document Excerpts:

{context}

## User Question:

{query}

{ending}"""

        return prompt

    def format_for_llm(
        self,
        query: str,
        results: List[Dict],
        system_prompt: Optional[str] = None,
        document_type: str = "general"
    ) -> str:
        """
        One-stop method: format retrieval results and create the full prompt

        Args:
            query: user query
            results: list of retrieval results
            system_prompt: optional system prompt
            document_type: document type ("paper", "cv", "general")

        Returns:
            full prompt string
        """
        context = self.format_context(results, document_type=document_type)
        return self.create_prompt(query, context, system_prompt, document_type)
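A short sketch of the formatter's language routing and context assembly (not part of the commit), with a hand-built result list standing in for real retrieval output:

from src.prompt_formatter import PromptFormatter

# Language detection: >20% CJK characters routes to the Chinese system prompt
assert PromptFormatter.detect_language("什麼是向量檢索?") == "zh"
assert PromptFormatter.detect_language("What is vector retrieval?") == "en"

fake_results = [{
    "content": "BM25 ranks documents by term frequency and inverse document frequency.",
    "metadata": {"title": "IR Notes", "file_path": "notes/ir.md"},
    "rerank_score": 0.91,   # picked up as the relevance score in "detailed" style
}]

formatter = PromptFormatter(format_style="detailed", max_context_length=4000)
prompt = formatter.format_for_llm("What is BM25?", fake_results, document_type="general")
print(prompt)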
src/retrievers/__init__.py
ADDED
@@ -0,0 +1,17 @@
"""
Retriever modules
"""
from .base import BaseRetriever
from .bm25_retriever import BM25Retriever
from .vector_retriever import VectorRetriever
from .hybrid_search import HybridSearch
from .reranker import Reranker, RAGPipeline

__all__ = [
    "BaseRetriever",
    "BM25Retriever",
    "VectorRetriever",
    "HybridSearch",
    "Reranker",
    "RAGPipeline",
]
src/retrievers/base.py
ADDED
@@ -0,0 +1,32 @@
"""
Abstract base class for the retriever modules
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Optional


class BaseRetriever(ABC):
    """Abstract base class for retrievers"""

    @abstractmethod
    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Retrieve relevant documents and return scored results.

        Args:
            query: query text
            top_k: return the top k results
            metadata_filter: optional dict of metadata filter conditions.
                For example: {"arxiv_id": "1234.5678"} or {"title": "Machine Learning"}.
                Multiple conditions are supported; all must hold simultaneously (AND logic).

        Returns:
            List of relevant documents. Each document dict should include a "score" key,
            where a higher score means higher relevance. Results are filtered by metadata_filter.
        """
        pass
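To make the contract concrete, here is a minimal sketch (not part of the commit) of what the interface demands of a concrete retriever; the keyword-overlap scoring is a toy stand-in, not one of this repo's retrievers:

from typing import List, Dict, Optional
from src.retrievers.base import BaseRetriever

class KeywordCountRetriever(BaseRetriever):
    """Toy retriever: scores by raw keyword overlap (illustrative only)."""

    def __init__(self, documents: List[Dict]):
        self.documents = documents

    def retrieve(self, query: str, top_k: int = 5,
                 metadata_filter: Optional[Dict] = None) -> List[Dict]:
        terms = set(query.lower().split())
        scored = []
        for doc in self.documents:
            # AND logic over all metadata filter conditions
            if metadata_filter and any(
                doc["metadata"].get(k) != v for k, v in metadata_filter.items()
            ):
                continue
            score = sum(t in doc["content"].lower() for t in terms)
            scored.append({**doc, "score": float(score)})
        # Higher score means higher relevance, as the base class requires
        return sorted(scored, key=lambda d: d["score"], reverse=True)[:top_k]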
src/retrievers/bm25_retriever.py
ADDED
@@ -0,0 +1,127 @@
"""
BM25 retriever module
"""
from typing import List, Dict, Optional
from rank_bm25 import BM25Okapi
import re
from .base import BaseRetriever


class BM25Retriever(BaseRetriever):
    """Keyword retrieval using the BM25 algorithm"""

    def __init__(self, documents: List[Dict]):
        """
        Initialize the BM25 retriever

        Args:
            documents: list of documents, each containing "content" and "metadata"
        """
        self.documents = documents
        self.texts = [doc["content"] for doc in documents]

        # Tokenize the texts (simple word splitting)
        tokenized_texts = [self._tokenize(text) for text in self.texts]

        # Initialize BM25
        self.bm25 = BM25Okapi(tokenized_texts)

    def _tokenize(self, text: str) -> List[str]:
        """
        Convert text into tokens (simple implementation)

        Args:
            text: input text

        Returns:
            list of tokens
        """
        # Lowercase and split
        text = text.lower()
        # Split with a regular expression (keep letters and digits)
        tokens = re.findall(r'\b\w+\b', text)
        return tokens

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Retrieve relevant documents, with optional metadata filtering.

        Args:
            query: query text
            top_k: return the top k results
            metadata_filter: optional dict of metadata filter conditions.
                For example: {"arxiv_id": "1234.5678"} retrieves only chunks of a specific paper,
                and {"title": "Machine Learning"} retrieves only papers with that title.
                Multiple conditions are supported; all must hold simultaneously (AND logic).
                Note: BM25 filtering happens after retrieval, so fewer than top_k results may come back.

        Returns:
            List of relevant documents, each containing "content", "metadata", "score".
            Results are filtered by metadata_filter.
        """
        # Tokenize the query
        tokenized_query = self._tokenize(query)

        # Compute BM25 scores
        scores = self.bm25.get_scores(tokenized_query)

        # Fetch and sort candidates (fetch extra results to compensate for post-filtering losses)
        # Without a filter only top_k are needed; with a filter more candidates are required
        candidate_k = top_k * 3 if metadata_filter else top_k

        # Candidate indices, sorted by score in descending order
        sorted_indices = sorted(
            range(len(scores)),
            key=lambda i: scores[i],
            reverse=True
        )[:candidate_k]

        # Build the candidate results
        candidate_results = []
        for idx in sorted_indices:
            candidate_results.append({
                "content": self.documents[idx]["content"],
                "metadata": self.documents[idx]["metadata"],
                "score": float(scores[idx]),
            })

        # Apply metadata_filter if provided
        if metadata_filter:
            filtered_results = []
            for result in candidate_results:
                # Check whether this result's metadata satisfies every filter condition
                metadata = result.get("metadata", {})
                matches_all = True

                for filter_key, filter_value in metadata_filter.items():
                    # Look up the corresponding metadata value on the document
                    doc_value = metadata.get(filter_key)

                    # Check for a match.
                    # Strings support exact and substring matching (when filter_value and doc_value are both strings)
                    if isinstance(filter_value, str) and isinstance(doc_value, str):
                        # String matching: exact or containment
                        if filter_value.lower() not in doc_value.lower():
                            matches_all = False
                            break
                    else:
                        # Other types (numbers, booleans, etc.) use exact matching
                        if doc_value != filter_value:
                            matches_all = False
                            break

                # Keep the result if all conditions hold
                if matches_all:
                    filtered_results.append(result)

            # Return the filtered results (at most top_k)
            return filtered_results[:top_k]
        else:
            # No filter; return the candidates directly
            return candidate_results
src/retrievers/hybrid_search.py
ADDED
@@ -0,0 +1,298 @@
"""
Hybrid search module: combines BM25 and vector retrieval
Supports two fusion methods: weighted sum and Reciprocal Rank Fusion (RRF)
"""
from typing import List, Dict, Optional, Literal
from .base import BaseRetriever
import numpy as np


class HybridSearch(BaseRetriever):
    """Hybrid search combining sparse and dense retrieval"""

    def __init__(
        self,
        sparse_retriever: BaseRetriever,
        dense_retriever: BaseRetriever,
        sparse_weight: float = 0.4,
        dense_weight: float = 0.6,
        fusion_method: Literal["weighted_sum", "rrf"] = "rrf",
        rrf_k: int = 60,
    ):
        """
        Initialize hybrid search

        Args:
            sparse_retriever: sparse retriever (e.g. BM25)
            dense_retriever: dense retriever (e.g. vector retrieval)
            sparse_weight: weight of the sparse retrieval score (weighted_sum only)
            dense_weight: weight of the dense retrieval score (weighted_sum only)
            fusion_method: fusion method, either "weighted_sum" or "rrf"
                - "weighted_sum": weighted sum; requires score normalization and weights
                - "rrf": Reciprocal Rank Fusion;
                  needs no score normalization and is more robust to differing score distributions
            rrf_k: the constant k in RRF, usually 60 (rrf only).
                A larger k gives lower-ranked documents more weight.
        """
        self.sparse_retriever = sparse_retriever
        self.dense_retriever = dense_retriever
        self.fusion_method = fusion_method
        self.rrf_k = rrf_k

        # Weights are only used by the weighted_sum method
        if fusion_method == "weighted_sum":
            self.sparse_weight = sparse_weight
            self.dense_weight = dense_weight

            # Make sure the weights sum to 1
            total_weight = sparse_weight + dense_weight
            if abs(total_weight - 1.0) > 1e-6:
                self.sparse_weight = sparse_weight / total_weight
                self.dense_weight = dense_weight / total_weight

    def _normalize_scores(self, results: List[Dict]) -> List[Dict]:
        """
        Normalize scores into the [0, 1] range.
        Used only by the weighted_sum method.

        Args:
            results: list of retrieval results, each dict containing 'score'

        Returns:
            list of results with normalized scores
        """
        scores = [res.get("score", 0.0) for res in results]
        if not scores:
            return results

        scores_array = np.array(scores)
        min_score = scores_array.min()
        max_score = scores_array.max()

        if max_score == min_score:
            # If all scores are identical, set them all to 1.0
            normalized_scores = [1.0] * len(scores)
        else:
            normalized_scores = ((scores_array - min_score) / (max_score - min_score)).tolist()

        for i, res in enumerate(results):
            res["score"] = normalized_scores[i]

        return results

    def _get_doc_id(self, doc: Dict) -> str:
        """
        Extract a unique identifier from a document

        Args:
            doc: document dict

        Returns:
            unique document ID
        """
        metadata = doc.get("metadata", {})
        return f"{metadata.get('arxiv_id', 'unknown')}_{metadata.get('chunk_index', 0)}"

    def _apply_rrf(
        self,
        sparse_results: List[Dict],
        dense_results: List[Dict]
    ) -> List[Dict]:
        """
        Apply Reciprocal Rank Fusion (RRF)

        RRF formula: RRF(d) = Σ(1 / (k + rank_i(d)))
        where:
        - d is the document
        - rank_i(d) is the document's rank in the i-th result list (starting at 1)
        - k is a constant (default 60)

        Advantages of RRF:
        1. No score normalization needed; more robust to retrievers with differing score distributions
        2. Depends only on rank positions, not on score values
        3. Automatically handles score-distribution mismatch

        Args:
            sparse_results: list of sparse retrieval results
            dense_results: list of dense retrieval results

        Returns:
            fused result list, sorted by RRF score
        """
        # Map document IDs to RRF scores
        doc_to_rrf_score = {}

        # Process the sparse retrieval results (BM25)
        for rank, result in enumerate(sparse_results, start=1):
            doc_id = self._get_doc_id(result)
            if doc_id not in doc_to_rrf_score:
                doc_to_rrf_score[doc_id] = {
                    "doc": result,
                    "rrf_score": 0.0,
                    "sparse_rank": None,
                    "dense_rank": None
                }
            # RRF contribution: 1 / (k + rank)
            doc_to_rrf_score[doc_id]["rrf_score"] += 1.0 / (self.rrf_k + rank)
            doc_to_rrf_score[doc_id]["sparse_rank"] = rank

        # Process the dense retrieval results (vectors)
        for rank, result in enumerate(dense_results, start=1):
            doc_id = self._get_doc_id(result)
            if doc_id not in doc_to_rrf_score:
                doc_to_rrf_score[doc_id] = {
                    "doc": result,
                    "rrf_score": 0.0,
                    "sparse_rank": None,
                    "dense_rank": None
                }
            # RRF contribution: 1 / (k + rank)
            doc_to_rrf_score[doc_id]["rrf_score"] += 1.0 / (self.rrf_k + rank)
            doc_to_rrf_score[doc_id]["dense_rank"] = rank

        # Build the result list
        rrf_results = []
        for doc_id, data in doc_to_rrf_score.items():
            result = data["doc"].copy()
            result["hybrid_score"] = data["rrf_score"]
            result["rrf_score"] = data["rrf_score"]
            result["sparse_rank"] = data["sparse_rank"]
            result["dense_rank"] = data["dense_rank"]

            # Pull the original scores from the raw results for reference
            if data["sparse_rank"] is not None:
                # Original score from the sparse retrieval results
                for sparse_res in sparse_results:
                    if self._get_doc_id(sparse_res) == doc_id:
                        result["sparse_score"] = sparse_res.get("score", 0.0)
                        break
            else:
                result["sparse_score"] = None

            if data["dense_rank"] is not None:
                # Original score from the dense retrieval results
                for dense_res in dense_results:
                    if self._get_doc_id(dense_res) == doc_id:
                        result["dense_score"] = dense_res.get("score", 0.0)
                        break
            else:
                result["dense_score"] = None

            rrf_results.append(result)

        # Sort by RRF score, descending
        rrf_results.sort(key=lambda x: x["rrf_score"], reverse=True)

        return rrf_results

    def _apply_weighted_sum(
        self,
        sparse_results: List[Dict],
        dense_results: List[Dict]
    ) -> List[Dict]:
        """
        Apply the weighted-sum fusion method

        This method requires:
        1. Normalizing both score sets into the same range
        2. Taking a weighted sum according to the configured weights

        Args:
            sparse_results: list of sparse retrieval results
            dense_results: list of dense retrieval results

        Returns:
            fused result list, sorted by hybrid score
        """
        # Normalize both score sets
        normalized_sparse = self._normalize_scores(sparse_results)
        normalized_dense = self._normalize_scores(dense_results)

        # Combine the scores
        doc_to_scores = {}

        # Process the sparse retrieval results
        for res in normalized_sparse:
            doc_id = self._get_doc_id(res)
            if doc_id not in doc_to_scores:
                doc_to_scores[doc_id] = {"doc": res, "sparse": 0.0, "dense": 0.0}
            doc_to_scores[doc_id]["sparse"] = res["score"]

        # Process the dense retrieval results
        for res in normalized_dense:
            doc_id = self._get_doc_id(res)
            if doc_id not in doc_to_scores:
                doc_to_scores[doc_id] = {"doc": res, "sparse": 0.0, "dense": 0.0}
            doc_to_scores[doc_id]["dense"] = res["score"]

        # Compute the hybrid scores and sort
        hybrid_results = []
        for doc_id, scores in doc_to_scores.items():
            hybrid_score = (
                self.sparse_weight * scores["sparse"] +
                self.dense_weight * scores["dense"]
            )

            result = scores["doc"].copy()
            result["hybrid_score"] = hybrid_score
            result["sparse_score"] = scores["sparse"]
            result["dense_score"] = scores["dense"]
            hybrid_results.append(result)

        # Sort by hybrid score, descending
        hybrid_results.sort(key=lambda x: x["hybrid_score"], reverse=True)

        return hybrid_results

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Run the hybrid search, with optional metadata filtering

        Args:
            query: query text
            top_k: return the top k results
            metadata_filter: optional dict of metadata filter conditions.
                For example: {"arxiv_id": "1234.5678"} retrieves only chunks of a specific paper,
                and {"title": "Machine Learning"} retrieves only papers with that title.
                Multiple conditions are supported; all must hold simultaneously (AND logic).
                The filter is passed down to the underlying sparse and dense retrievers.

        Returns:
            List of relevant documents, each containing "content", "metadata", "hybrid_score".
            Results are filtered by metadata_filter.

            Depending on fusion_method, results carry different score fields:
            - RRF method: "rrf_score", "sparse_rank", "dense_rank"
            - Weighted-sum method: "sparse_score", "dense_score"
        """
        # 1. Fetch results from both retrievers (request extra results to ensure coverage)
        # Pass metadata_filter down to the underlying retrievers
        sparse_results = self.sparse_retriever.retrieve(
            query,
            top_k=top_k * 2,
            metadata_filter=metadata_filter
        )
        dense_results = self.dense_retriever.retrieve(
            query,
            top_k=top_k * 2,
            metadata_filter=metadata_filter
        )

        # 2. Fuse the results with the chosen fusion method
        if self.fusion_method == "rrf":
            # Use RRF (Reciprocal Rank Fusion)
            # RRF needs no score normalization; it fuses directly on ranks
            hybrid_results = self._apply_rrf(sparse_results, dense_results)
        else:
            # Use the weighted-sum method
            # Scores must be normalized first, then combined by weight
            hybrid_results = self._apply_weighted_sum(sparse_results, dense_results)

        # 3. Return the top_k results
        return hybrid_results[:top_k]
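To make the RRF docstring concrete, the fusion arithmetic by hand with k = 60: a document ranked 1st by BM25 and 3rd by the vector retriever scores 1/61 + 1/63 ≈ 0.0323, beating a document ranked 2nd in only one list (1/62 ≈ 0.0161). A standalone sketch of the same computation (not part of the commit):

# Standalone RRF arithmetic, mirroring _apply_rrf above (k = 60).
k = 60
sparse_ranking = ["doc_a", "doc_b", "doc_c"]   # BM25 order
dense_ranking  = ["doc_c", "doc_a", "doc_d"]   # vector order

rrf = {}
for ranking in (sparse_ranking, dense_ranking):
    for rank, doc_id in enumerate(ranking, start=1):
        rrf[doc_id] = rrf.get(doc_id, 0.0) + 1.0 / (k + rank)

# doc_a (ranks 1 and 2) ends up above doc_c (ranks 3 and 1)
for doc_id, score in sorted(rrf.items(), key=lambda x: x[1], reverse=True):
    print(f"{doc_id}: {score:.4f}")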
src/retrievers/reranker.py
ADDED
@@ -0,0 +1,448 @@
"""
Reranking module: precise reranking with a Cross-Encoder
"""
from typing import List, Dict, Optional, Tuple
from sentence_transformers import CrossEncoder
import time
import logging

# Try to import torch to detect the available device
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_device() -> str:
    """
    Auto-detect and return the best available device

    Returns:
        device name: 'mps' (macOS GPU), 'cuda' (NVIDIA GPU), or 'cpu'
    """
    if not TORCH_AVAILABLE:
        return 'cpu'

    # Priority: MPS (macOS) > CUDA (NVIDIA) > CPU
    if torch.backends.mps.is_available():
        return 'mps'
    elif torch.cuda.is_available():
        return 'cuda'
    else:
        return 'cpu'


class Reranker:
    """Reranking component: precise reranking with a Cross-Encoder"""

    def __init__(
        self,
        model_name: str = "BAAI/bge-reranker-base",
        device: str = None,
        max_length: int = 512,
        batch_size: int = 32,
        enable_cache: bool = True
    ):
        """
        Initialize the Cross-Encoder model

        Args:
            model_name: Cross-Encoder model name
            device: device name ('cuda', 'cpu', 'mps')
            max_length: maximum token length (model limit)
            batch_size: batch size, to optimize memory usage
            enable_cache: whether to enable model caching
        """
        try:
            # Auto-detect the device (if not specified)
            if device is None:
                device = get_device()

            device_name_map = {
                'mps': 'MPS (macOS GPU)',
                'cuda': 'CUDA (NVIDIA GPU)',
                'cpu': 'CPU'
            }
            device_display = device_name_map.get(device, device)

            self.model = CrossEncoder(
                model_name,
                device=device,
                max_length=max_length
            )
            self.max_length = max_length
            self.batch_size = batch_size
            self.model_name = model_name
            logger.info(f"✅ Reranking model {model_name} loaded (device: {device_display})")
        except Exception as e:
            logger.error(f"❌ Failed to load the model: {e}")
            raise

    def _truncate_text(self, text: str, max_chars: int = 2000) -> str:
        """
        Truncate overly long text (a rough estimate, to stay under the token limit)

        Args:
            text: original text
            max_chars: maximum number of characters (conservative estimate, roughly 500 tokens)

        Returns:
            truncated text
        """
        if len(text) <= max_chars:
            return text
        # Truncate and append an ellipsis
        return text[:max_chars - 3] + "..."

    def _prepare_pairs(
        self,
        query: str,
        documents: List[Dict]
    ) -> List[Tuple[str, str]]:
        """
        Prepare (query, document) pairs, handling text length

        Args:
            query: query text
            documents: list of documents

        Returns:
            list of (query, content) pairs
        """
        pairs = []
        truncated_indices = []  # track which documents were truncated

        # Rough estimate: roughly 0.25 tokens per character, leaving room for the query
        max_doc_chars = int((self.max_length * 0.7) - len(query))

        for i, doc in enumerate(documents):
            content = doc.get("content", "")
            original_length = len(content)

            # Truncate if the content is too long
            if len(content) > max_doc_chars:
                content = self._truncate_text(content, max_doc_chars)
                truncated_indices.append(i)

            pairs.append([query, content])

        if truncated_indices:
            logger.warning(
                f"⚠️ {len(truncated_indices)} documents were truncated for being too long "
                f"(maximum length: {max_doc_chars} characters)"
            )

        return pairs

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        top_k: int = 5,
        preserve_original_scores: bool = True
    ) -> List[Dict]:
        """
        Run the precise reranking step

        Args:
            query: query text
            documents: list of documents, each should contain "content" and optionally "hybrid_score"
            top_k: return the top k results
            preserve_original_scores: whether to keep the original scores (hybrid_score)

        Returns:
            reranked list of documents, sorted by rerank_score in descending order
        """
        if not documents:
            logger.warning("⚠️ Document list is empty; returning an empty result")
            return []

        if not query or not query.strip():
            logger.warning("⚠️ Query is empty; returning documents in the original order")
            return documents[:top_k]

        start_time = time.time()
        logger.info(f"🔄 Reranking {len(documents)} documents...")

        try:
            # 1. Prepare the pairs
            pairs = self._prepare_pairs(query, documents)

            # 2. Compute scores in batches (optimizes memory usage)
            scores = []
            for i in range(0, len(pairs), self.batch_size):
                batch_pairs = pairs[i:i + self.batch_size]
                batch_scores = self.model.predict(batch_pairs)
                scores.extend(batch_scores.tolist() if hasattr(batch_scores, 'tolist') else batch_scores)

            # 3. Update the document scores
            for i, doc in enumerate(documents):
                doc = doc.copy()  # avoid mutating the original document
                doc["rerank_score"] = float(scores[i])
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
# 嘗試導入 torch 來檢測可用的設備
|
| 10 |
+
try:
|
| 11 |
+
import torch
|
| 12 |
+
TORCH_AVAILABLE = True
|
| 13 |
+
except ImportError:
|
| 14 |
+
TORCH_AVAILABLE = False
|
| 15 |
+
|
| 16 |
+
# 配置日志
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_device() -> str:
|
| 22 |
+
"""
|
| 23 |
+
自動檢測並返回最佳可用的設備
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
設備名稱: 'mps' (macOS GPU), 'cuda' (NVIDIA GPU), 或 'cpu'
|
| 27 |
+
"""
|
| 28 |
+
if not TORCH_AVAILABLE:
|
| 29 |
+
return 'cpu'
|
| 30 |
+
|
| 31 |
+
# 優先順序: MPS (macOS) > CUDA (NVIDIA) > CPU
|
| 32 |
+
if torch.backends.mps.is_available():
|
| 33 |
+
return 'mps'
|
| 34 |
+
elif torch.cuda.is_available():
|
| 35 |
+
return 'cuda'
|
| 36 |
+
else:
|
| 37 |
+
return 'cpu'
|
| 38 |
+
|
| 39 |
+
|
class Reranker:
    """Reranking component: precise reranking with a Cross-Encoder"""

    def __init__(
        self,
        model_name: str = "BAAI/bge-reranker-base",
        device: str = None,
        max_length: int = 512,
        batch_size: int = 32,
        enable_cache: bool = True  # accepted but not used by this implementation
    ):
        """
        Initialize the Cross-Encoder model

        Args:
            model_name: Cross-Encoder model name
            device: device name ('cuda', 'cpu', 'mps')
            max_length: maximum token length (model limit)
            batch_size: batch size, used to optimize memory usage
            enable_cache: whether to enable model caching
        """
        try:
            # Auto-detect the device if not specified
            if device is None:
                device = get_device()

            device_name_map = {
                'mps': 'MPS (macOS GPU)',
                'cuda': 'CUDA (NVIDIA GPU)',
                'cpu': 'CPU'
            }
            device_display = device_name_map.get(device, device)

            self.model = CrossEncoder(
                model_name,
                device=device,
                max_length=max_length
            )
            self.max_length = max_length
            self.batch_size = batch_size
            self.model_name = model_name
            logger.info(f"✅ Rerank model {model_name} loaded (device: {device_display})")
        except Exception as e:
            logger.error(f"❌ Failed to load model: {e}")
            raise

    def _truncate_text(self, text: str, max_chars: int = 2000) -> str:
        """
        Truncate overly long text (rough estimate, to stay under the token limit)

        Args:
            text: original text
            max_chars: maximum number of characters (conservative estimate, ~500 tokens)

        Returns:
            Truncated text
        """
        if len(text) <= max_chars:
            return text
        # Truncate and append an ellipsis
        return text[:max_chars - 3] + "..."

    def _prepare_pairs(
        self,
        query: str,
        documents: List[Dict]
    ) -> List[Tuple[str, str]]:
        """
        Prepare (query, document) pairs, handling text length

        Args:
            query: query text
            documents: list of documents

        Returns:
            List of (query, content) pairs
        """
        pairs = []
        truncated_indices = []  # Track which documents were truncated

        # Rough estimate: reserve room for the query within the model's length budget
        max_doc_chars = int((self.max_length * 0.7) - len(query))

        for i, doc in enumerate(documents):
            content = doc.get("content", "")

            # Truncate if the content is too long
            if len(content) > max_doc_chars:
                content = self._truncate_text(content, max_doc_chars)
                truncated_indices.append(i)

            pairs.append((query, content))

        if truncated_indices:
            logger.warning(
                f"⚠️ {len(truncated_indices)} documents were truncated for being too long "
                f"(max length: {max_doc_chars} characters)"
            )

        return pairs

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        top_k: int = 5,
        preserve_original_scores: bool = True
    ) -> List[Dict]:
        """
        Perform precise reranking

        Args:
            query: query text
            documents: list of documents; each should contain "content" and optionally "hybrid_score"
            top_k: return the top k results
            preserve_original_scores: whether to keep the original scores (hybrid_score)

        Returns:
            Reranked document list, sorted by rerank_score in descending order
        """
        if not documents:
            logger.warning("⚠️ Document list is empty, returning empty result")
            return []

        if not query or not query.strip():
            logger.warning("⚠️ Query is empty, returning documents in original order")
            return documents[:top_k]

        start_time = time.time()
        logger.info(f"🔄 Reranking {len(documents)} documents...")

        try:
            # 1. Prepare pairs
            pairs = self._prepare_pairs(query, documents)

            # 2. Compute scores in batches (optimizes memory usage)
            scores = []
            for i in range(0, len(pairs), self.batch_size):
                batch_pairs = pairs[i:i + self.batch_size]
                batch_scores = self.model.predict(batch_pairs)
                scores.extend(batch_scores.tolist() if hasattr(batch_scores, 'tolist') else batch_scores)

            # 3. Update document scores
            for i, doc in enumerate(documents):
                doc = doc.copy()  # Avoid mutating the original document
                doc["rerank_score"] = float(scores[i])

                # Keep the original score for reference
                if preserve_original_scores:
                    if "hybrid_score" not in doc:
                        # If there is no hybrid_score, fall back to other scores
                        doc["original_score"] = doc.get("score", 0.0)

                documents[i] = doc

            # 4. Re-sort by rerank_score
            reranked_docs = sorted(
                documents,
                key=lambda x: x.get("rerank_score", float('-inf')),
                reverse=True
            )

            # 5. Statistics
            elapsed_time = time.time() - start_time
            avg_score = sum(scores) / len(scores) if scores else 0.0
            max_score = max(scores) if scores else 0.0
            min_score = min(scores) if scores else 0.0

            logger.info(
                f"✅ Reranking complete (elapsed: {elapsed_time:.2f}s, "
                f"avg score: {avg_score:.4f}, "
                f"range: [{min_score:.4f}, {max_score:.4f}])"
            )

            return reranked_docs[:top_k]

        except Exception as e:
            logger.error(f"❌ Error during reranking: {e}")
            # Fallback strategy: return the first top_k in original order
            logger.warning("⚠️ Using fallback strategy: returning original order")
            return documents[:top_k]

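A minimal usage sketch for the class above (illustrative, not part of the commit; the query, documents, and scores are hypothetical, and the model is downloaded on first use):

    from src.retrievers.reranker import Reranker

    reranker = Reranker(model_name="BAAI/bge-reranker-base", batch_size=16)
    docs = [
        {"content": "BM25 is a lexical ranking function.", "hybrid_score": 0.42},
        {"content": "Cross-encoders score (query, document) pairs jointly.", "hybrid_score": 0.40},
    ]
    top = reranker.rerank("how do cross-encoders rank documents?", docs, top_k=1)
    print(top[0]["rerank_score"], top[0]["content"])
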
class RAGPipeline:
    """Orchestration pipeline: manages the full RAG flow (recall + rerank)"""

    def __init__(
        self,
        hybrid_search,
        reranker,
        recall_k: int = 25,
        adaptive_recall: bool = True,
        min_recall_k: int = 10,
        max_recall_k: int = 50
    ):
        """
        Initialize the RAG pipeline

        Args:
            hybrid_search: HybridSearch instance
            reranker: Reranker instance
            recall_k: number of candidates to recall in the first stage (default value)
            adaptive_recall: whether to adjust recall_k dynamically per query
            min_recall_k: minimum recall count
            max_recall_k: maximum recall count
        """
        self.hybrid_search = hybrid_search
        self.reranker = reranker
        self.base_recall_k = recall_k
        self.adaptive_recall = adaptive_recall
        self.min_recall_k = min_recall_k
        self.max_recall_k = max_recall_k

        # Performance statistics
        self.stats = {
            "total_queries": 0,
            "avg_recall_time": 0.0,
            "avg_rerank_time": 0.0,
            "avg_total_time": 0.0
        }

    def _calculate_adaptive_recall_k(self, query: str) -> int:
        """
        Dynamically compute recall_k based on query complexity

        Args:
            query: query text

        Returns:
            Adjusted recall_k
        """
        if not self.adaptive_recall:
            return self.base_recall_k

        # Simple heuristic: adjust by query length and keyword count
        query_length = len(query.split())
        keyword_count = len(set(query.lower().split()))

        # Complex queries need more candidates
        if query_length > 10 or keyword_count > 5:
            recall_k = min(self.base_recall_k * 2, self.max_recall_k)
        elif query_length < 3:
            recall_k = max(self.base_recall_k // 2, self.min_recall_k)
        else:
            recall_k = self.base_recall_k

        return recall_k

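For reference, with the defaults above (recall_k=25, min_recall_k=10, max_recall_k=50) the heuristic resolves as follows; the sample queries are illustrative:

    # "transformers"                            -> 1 word (< 3)    -> max(25 // 2, 10) = 12
    # "what is retrieval augmented generation"  -> 5 words         -> base value 25
    # "compare BM25 and dense retrieval trade-offs
    #  for long multi-hop questions over papers"-> 12 words (> 10) -> min(25 * 2, 50) = 50
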
    def query(
        self,
        text: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        enable_rerank: bool = True,
        return_stats: bool = False
    ) -> List[Dict]:
        """
        Execute the full search flow

        Args:
            text: query text
            top_k: number of final results to return
            metadata_filter: optional metadata filter conditions
            enable_rerank: whether to enable reranking (optional, for performance testing)
            return_stats: whether to return performance statistics

        Returns:
            List of relevant documents; if return_stats=True, a (results, stats) tuple
        """
        if not text or not text.strip():
            logger.warning("⚠️ Query is empty")
            return []

        total_start = time.time()
        self.stats["total_queries"] += 1

        # Compute recall_k dynamically
        recall_k = self._calculate_adaptive_recall_k(text)
        logger.info(
            f"🔍 Searching: '{text[:50]}...' "
            f"(recall stage: {recall_k} candidates, final: {top_k} results)"
        )

        try:
            # Stage 1: hybrid search (recall stage)
            recall_start = time.time()
            initial_results = self.hybrid_search.retrieve(
                query=text,
                top_k=recall_k,
                metadata_filter=metadata_filter
            )
            recall_time = time.time() - recall_start

            if not initial_results:
                logger.warning("⚠️ Recall stage found no results")
                return []

            logger.info(
                f"✅ Recall stage complete: found {len(initial_results)} candidates "
                f"(elapsed: {recall_time:.2f}s)"
            )

            # Stage 2: reranking (fine-grained filtering)
            if enable_rerank and len(initial_results) > top_k:
                rerank_start = time.time()
                final_results = self.reranker.rerank(
                    query=text,
                    documents=initial_results,
                    top_k=top_k
                )
                rerank_time = time.time() - rerank_start

                logger.info(
                    f"✅ Rerank stage complete: selected {len(final_results)} results "
                    f"out of {len(initial_results)} candidates (elapsed: {rerank_time:.2f}s)"
                )
            else:
                # Skip reranking (for performance testing or when candidates are few)
                final_results = initial_results[:top_k]
                rerank_time = 0.0
                logger.info("⏭️ Skipping rerank stage (too few candidates or disabled)")

            # Update statistics
            total_time = time.time() - total_start
            self._update_stats(recall_time, rerank_time, total_time)

            # Optionally attach performance info to the result
            if return_stats:
                stats = {
                    "recall_time": recall_time,
                    "rerank_time": rerank_time,
                    "total_time": total_time,
                    "recall_k": recall_k,
                    "candidates_found": len(initial_results),
                    "final_results": len(final_results)
                }
                return final_results, stats

            return final_results

        except Exception as e:
            logger.error(f"❌ Error during query: {e}")
            # Fallback strategy: try using the recall stage only
            try:
                logger.warning("⚠️ Trying fallback strategy: recall results only")
                return self.hybrid_search.retrieve(text, top_k=top_k, metadata_filter=metadata_filter)
            except Exception as e2:
                logger.error(f"❌ Fallback strategy also failed: {e2}")
                return []

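A sketch of driving the two-stage flow end to end; it assumes a HybridSearch instance has already been built (see src/retrievers/hybrid_search.py), and the variable names are illustrative:

    pipeline = RAGPipeline(hybrid_search=hybrid_search, reranker=Reranker(), recall_k=25)
    results, stats = pipeline.query("What is RLHF?", top_k=5, return_stats=True)
    for r in results:
        print(f"{r['rerank_score']:.3f}  {r['content'][:60]}")
    print(f"recall: {stats['recall_time']:.2f}s, rerank: {stats['rerank_time']:.2f}s")
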
    def _update_stats(self, recall_time: float, rerank_time: float, total_time: float):
        """Update running-average performance statistics"""
        n = self.stats["total_queries"]
        self.stats["avg_recall_time"] = (
            (self.stats["avg_recall_time"] * (n - 1) + recall_time) / n
        )
        self.stats["avg_rerank_time"] = (
            (self.stats["avg_rerank_time"] * (n - 1) + rerank_time) / n
        )
        self.stats["avg_total_time"] = (
            (self.stats["avg_total_time"] * (n - 1) + total_time) / n
        )

    def get_stats(self) -> Dict:
        """Get performance statistics"""
        return self.stats.copy()

    def reset_stats(self):
        """Reset statistics"""
        self.stats = {
            "total_queries": 0,
            "avg_recall_time": 0.0,
            "avg_rerank_time": 0.0,
            "avg_total_time": 0.0
        }

    def format_results_for_llm(
        self,
        results: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format retrieval results for the LLM (requires importing PromptFormatter)

        Args:
            results: list of retrieval results
            format_style: format style ("detailed", "simple", "minimal")

        Returns:
            Formatted context string
        """
        try:
            from ..prompt_formatter import PromptFormatter
            formatter = PromptFormatter(format_style=format_style)
            return formatter.format_context(results)
        except ImportError:
            # If the import fails, fall back to a simple format
            formatted_parts = []
            for i, result in enumerate(results, 1):
                metadata = result.get("metadata", {})
                content = result.get("content", "")
                arxiv_id = metadata.get('arxiv_id', 'N/A')
                title = metadata.get('title', 'N/A')
                formatted_parts.append(
                    f"[Source {i}: {title} (arXiv:{arxiv_id})]\n{content}\n"
                )
            # Join parts with a divider line between sources
            return "\n" + ("=" * 60 + "\n").join(formatted_parts)

src/retrievers/vector_retriever.py
ADDED
@@ -0,0 +1,254 @@
"""
Vector retriever module: semantic retrieval using embeddings and a vector database

Two initialization modes are supported:
1. Auto-initialize embeddings (default): create a new embedding model from the parameters
2. Use external embeddings: accept an already-initialized embedding model (can be shared with DocumentProcessor)
"""
from typing import List, Dict, Optional, Any
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
import os
from .base import BaseRetriever

# Try to import HuggingFaceEmbeddings (free models)
try:
    from langchain_community.embeddings import HuggingFaceEmbeddings
except ImportError:
    try:
        from langchain_huggingface import HuggingFaceEmbeddings
    except ImportError:
        raise ImportError("langchain-community or langchain-huggingface is required to use Hugging Face embeddings")

# Import torch to detect the available device
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False


def get_device() -> str:
    """
    Auto-detect and return the best available device

    Returns:
        Device name: 'mps' (macOS GPU), 'cuda' (NVIDIA GPU), or 'cpu'
    """
    if not TORCH_AVAILABLE:
        return 'cpu'

    # Priority: MPS (macOS) > CUDA (NVIDIA) > CPU
    if torch.backends.mps.is_available():
        return 'mps'
    elif torch.cuda.is_available():
        return 'cuda'
    else:
        return 'cpu'

class VectorRetriever(BaseRetriever):
    """Semantic search via vector retrieval"""

    def __init__(
        self,
        documents: List[Dict],
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        persist_directory: Optional[str] = "./chroma_db",
        hf_cache_dir: Optional[str] = None,
        device: Optional[str] = None,
        embeddings: Optional[Any] = None  # Optional: externally provided embedding model (takes precedence)
    ):
        """
        Initialize the vector retriever (using Hugging Face embeddings)

        Args:
            documents: list of documents, each containing "content" and "metadata"
            embedding_model: Hugging Face embedding model name (default: "sentence-transformers/all-MiniLM-L6-v2")
                             Used only when embeddings=None
            persist_directory: Chroma database persistence directory
            hf_cache_dir: Hugging Face model cache directory (e.g. a path on an external drive)
                          If None, the HF_HOME environment variable or the default ~/.cache/huggingface/ is used
                          Used only when embeddings=None
            device: device name ('mps', 'cuda', 'cpu'); if None, the best device is auto-detected
                    Used only when embeddings=None
            embeddings: optional external embedding model object
                        If provided, it takes precedence and the other parameters (embedding_model, hf_cache_dir, device) are ignored
                        This allows sharing one embedding model instance with DocumentProcessor
                        Benefits:
                        - Saves memory (the model is loaded only once)
                        - Saves time (avoids repeated initialization)
                        - Ensures consistency (chunking and retrieval use the same model)
        """
        # Prefer the shared model passed in
        if embeddings is not None:
            self.embeddings = embeddings
            print("✓ Using externally provided embeddings model (shared with DocumentProcessor)")
        else:
            # Otherwise run the original initialization logic
            print(f"Using Hugging Face embedding model: {embedding_model}")

            # Set the Hugging Face cache directory
            if hf_cache_dir:
                # If a cache directory is specified, set the environment variables
                os.environ['HF_HOME'] = hf_cache_dir
                os.environ['TRANSFORMERS_CACHE'] = hf_cache_dir
                print(f"Models will be stored in: {hf_cache_dir}")
            else:
                # Check whether the environment variable is already set
                default_cache = os.path.expanduser("~/.cache/huggingface")
                current_cache = os.getenv('HF_HOME', default_cache)
                print(f"Model cache location: {current_cache}")
                print("Tip: set the hf_cache_dir parameter or the HF_HOME environment variable to point at an external drive")

            # Auto-detect the device if not specified
            if device is None:
                device = get_device()

            device_name_map = {
                'mps': 'MPS (macOS GPU)',
                'cuda': 'CUDA (NVIDIA GPU)',
                'cpu': 'CPU'
            }
            print(f"Using device: {device_name_map.get(device, device)}")
            print("The model will be downloaded on first use, please wait...")

            # Build model_kwargs including cache directory and device
            model_kwargs = {'device': device}
            if hf_cache_dir:
                model_kwargs['cache_dir'] = hf_cache_dir

            self.embeddings = HuggingFaceEmbeddings(
                model_name=embedding_model,
                model_kwargs=model_kwargs,
                encode_kwargs={'normalize_embeddings': True}  # Normalize embeddings for better results
            )

        # Convert documents into LangChain Document format
        # Lists in metadata must be converted to strings, because ChromaDB does not accept list types
        def sanitize_metadata(metadata: Dict) -> Dict:
            """Convert lists in metadata to strings to satisfy ChromaDB's requirements"""
            sanitized = {}
            for key, value in metadata.items():
                if isinstance(value, list):
                    # Join lists into a comma-separated string
                    sanitized[key] = ", ".join(str(v) for v in value)
                elif isinstance(value, (dict, set)):
                    # Convert dicts or sets to strings
                    sanitized[key] = str(value)
                else:
                    # Other types (str, int, float, bool, None) are kept as-is
                    sanitized[key] = value
            return sanitized

        langchain_docs = [
            Document(
                page_content=doc["content"],
                metadata=sanitize_metadata(doc["metadata"])
            )
            for doc in documents
        ]

        # Create the vector database
        self.vectorstore = Chroma.from_documents(
            documents=langchain_docs,
            embedding=self.embeddings,
            persist_directory=persist_directory
        )

        # Create the retriever
        self.retriever = self.vectorstore.as_retriever()

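A sketch of the shared-embeddings path described in the docstring; `processor` stands in for whatever DocumentProcessor instance produced the chunks, and its `embeddings` attribute is an assumption here:

    retriever = VectorRetriever(
        documents=chunks,                 # e.g. output of DocumentProcessor
        embeddings=processor.embeddings,  # reuse the already-loaded model
        persist_directory="./chroma_db",
    )
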
    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Retrieve relevant documents and return a normalized similarity score (higher is better).
        Supports filtering by metadata.

        Args:
            query: query text
            top_k: return the top k results
            metadata_filter: optional metadata filter dictionary.
                             E.g. {"arxiv_id": "1234.5678"} retrieves chunks of a specific paper only,
                             or {"title": "Machine Learning"} retrieves papers with a specific title only.
                             Multiple conditions are supported; all must hold simultaneously (AND logic).
                             Note: ChromaDB's where conditions support exact matching, not partial matching

        Returns:
            List of relevant documents, each containing "content", "metadata", and "score".
            Results are filtered according to metadata_filter
        """
        # Build the filter conditions.
        # If metadata_filter is provided, fetch more results first and filter in Python,
        # because support for the filter parameter in LangChain ChromaDB's
        # similarity_search_with_score method may vary by version
        if metadata_filter:
            # Fetch more results to ensure enough candidates for filtering
            results_with_scores = self.vectorstore.similarity_search_with_score(
                query,
                k=top_k * 10  # Fetch more results
            )

            # Filter in Python
            filtered_results = []
            for doc, distance_score in results_with_scores:
                metadata = doc.metadata
                matches = True

                for key, value in metadata_filter.items():
                    doc_value = metadata.get(key)

                    # Check for a match
                    if isinstance(value, dict):
                        # Operator format supported (e.g. {"$eq": "value"})
                        if "$eq" in value:
                            if doc_value != value["$eq"]:
                                matches = False
                                break
                        else:
                            # Other operators can be added here
                            matches = False
                            break
                    elif isinstance(value, str) and isinstance(doc_value, str):
                        # String matching: partial matching (containment) supported
                        if value.lower() not in doc_value.lower():
                            matches = False
                            break
                    else:
                        # Exact matching for other types
                        if doc_value != value:
                            matches = False
                            break

                if matches:
                    filtered_results.append((doc, distance_score))

            # Keep only the top_k results
            results_with_scores = filtered_results[:top_k]
        else:
            # No filter conditions; fetch results directly
            results_with_scores = self.vectorstore.similarity_search_with_score(
                query,
                k=top_k
            )

        # Build the results and convert the scores
        results = []
        for doc, distance_score in results_with_scores:
            # Since the embeddings are normalized, the squared L2 distance is 2 - 2 * cos_sim
            # -> cos_sim = 1 - (distance^2 / 2)
            # Scores lie in [-1, 1] (typically [0, 1] in practice); higher means more similar
            similarity_score = 1 - (distance_score**2 / 2)

            results.append({
                "content": doc.page_content,
                "metadata": doc.metadata,
                "score": float(similarity_score),
            })

        return results

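A quick numeric check of the distance-to-similarity conversion used above: for unit vectors, ||a - b||^2 = 2 - 2 * cos(a, b), so 1 - d^2 / 2 recovers the cosine similarity (assuming, as the code does, that the store returns an L2 distance):

    import numpy as np

    a = np.array([1.0, 0.0])           # unit-norm
    b = np.array([0.6, 0.8])           # unit-norm
    d = np.linalg.norm(a - b)          # L2 distance
    print(1 - d**2 / 2, float(a @ b))  # both print 0.6
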
src/step_back_rag.py
ADDED
@@ -0,0 +1,305 @@
"""
Step-back Prompting dual-track RAG: combines concrete facts with abstract principles
Uses Step-back Prompting to retrieve concrete facts and abstract principles at the same time, improving answer quality
"""
from typing import List, Dict, Optional
from .retrievers.reranker import RAGPipeline
from .retrievers.vector_retriever import VectorRetriever
from .prompt_formatter import PromptFormatter
from .llm_integration import OllamaLLM
import time
import logging
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed

logger = logging.getLogger(__name__)


class StepBackRAG:
    """Dual-track RAG system using Step-back Prompting"""

    def __init__(
        self,
        rag_pipeline: RAGPipeline,
        vector_retriever: VectorRetriever,
        llm: OllamaLLM,
        step_back_temperature: float = 0.3,  # Use a lower temperature when generating the abstract question
        answer_temperature: float = 0.7,
        enable_parallel: bool = True
    ):
        """
        Initialize Step-back RAG

        Args:
            rag_pipeline: RAG pipeline instance (used for final answer generation)
            vector_retriever: vector retriever
            llm: LLM instance
            step_back_temperature: temperature for generating the abstract question (lower, more stable)
            answer_temperature: temperature for generating the answer
            enable_parallel: whether to run the dual-track retrieval in parallel
        """
        self.rag_pipeline = rag_pipeline
        self.vector_retriever = vector_retriever
        self.llm = llm
        self.step_back_temperature = step_back_temperature
        self.answer_temperature = answer_temperature
        self.enable_parallel = enable_parallel

    def _generate_step_back_question(self, question: str) -> str:
        """
        Generate the Step-back abstract question

        Args:
            question: original concrete question

        Returns:
            Abstract question
        """
        # Choose a prompt template matching the question's language
        # (the Chinese template below is intentional and kept verbatim)
        is_chinese = PromptFormatter.detect_language(question) == "zh"

        if is_chinese:
            prompt = f"""你是一個資深專家。請將以下具體問題轉換為一個更抽象、更基礎的原理性問題。
這個抽象問題應該幫助理解該領域的基礎概念和原理,而不是直接回答具體問題。

具體問題: {question}

請生成一個抽象問題,用於檢索相關的原理和背景知識:
"""
        else:
            prompt = f"""You are a senior expert. Please convert the following specific question into a more abstract, fundamental question about principles and concepts.
This abstract question should help understand the basic concepts and principles in this field, rather than directly answering the specific question.

Specific question: {question}

Please generate an abstract question for retrieving relevant principles and background knowledge:
"""

        try:
            abstract_question = self.llm.generate(
                prompt=prompt,
                temperature=self.step_back_temperature,
                max_tokens=200
            )

            abstract_question = abstract_question.strip()

            if not abstract_question:
                logger.warning("⚠️ Generated abstract question is empty, using the original question")
                return question

            logger.info(f"✅ Generated abstract question: '{abstract_question}'")
            return abstract_question

        except Exception as e:
            logger.error(f"⚠️ Error while generating the abstract question: {e}")
            return question

    def _get_doc_id(self, doc: Dict) -> str:
        """
        Generate a unique identifier for a document

        Args:
            doc: document dictionary

        Returns:
            Unique ID
        """
        metadata = doc.get("metadata", {})
        content = doc.get("content", "")

        if "arxiv_id" in metadata and "chunk_index" in metadata:
            return f"{metadata['arxiv_id']}_{metadata['chunk_index']}"
        elif "file_path" in metadata and "chunk_index" in metadata:
            return f"{metadata['file_path']}_{metadata['chunk_index']}"
        else:
            content_hash = hashlib.md5(content.encode()).hexdigest()[:16]
            return f"doc_{content_hash}"

    def _retrieve_direct(self, question: str, top_k: int, metadata_filter: Optional[Dict] = None) -> List[Dict]:
        """Retrieve the original question directly (concrete facts)"""
        return self.vector_retriever.retrieve(
            query=question,
            top_k=top_k,
            metadata_filter=metadata_filter
        )

    def _retrieve_step_back(self, question: str, top_k: int, metadata_filter: Optional[Dict] = None) -> tuple:
        """Step-back retrieval (abstract principles)"""
        abstract_question = self._generate_step_back_question(question)
        results = self.vector_retriever.retrieve(
            query=abstract_question,
            top_k=top_k,
            metadata_filter=metadata_filter
        )
        return results, abstract_question

    def query(
        self,
        question: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        return_abstract_question: bool = False
    ) -> Dict:
        """
        Execute the dual-track retrieval (without generating an answer)

        Args:
            question: original question
            top_k: number of results to return per track
            metadata_filter: optional metadata filter conditions
            return_abstract_question: whether to return the abstract question

        Returns:
            Dictionary containing the dual-track retrieval results
        """
        start_time = time.time()

        if self.enable_parallel:
            # Run the two tracks in parallel
            logger.info(f"🔄 Running dual-track retrieval in parallel: '{question}'")
            with ThreadPoolExecutor(max_workers=2) as executor:
                direct_future = executor.submit(
                    self._retrieve_direct, question, top_k, metadata_filter
                )
                step_back_future = executor.submit(
                    self._retrieve_step_back, question, top_k, metadata_filter
                )

                specific_results = direct_future.result()
                abstract_results, abstract_question = step_back_future.result()
        else:
            # Run serially
            logger.info(f"🔄 Running dual-track retrieval serially: '{question}'")
            specific_results = self._retrieve_direct(question, top_k, metadata_filter)
            abstract_results, abstract_question = self._retrieve_step_back(question, top_k, metadata_filter)

        elapsed_time = time.time() - start_time
        logger.info(
            f"✅ Dual-track retrieval complete (elapsed: {elapsed_time:.2f}s)\n"
            f"   Concrete facts: {len(specific_results)} results\n"
            f"   Abstract principles: {len(abstract_results)} results"
        )

        return {
            "specific_context": specific_results,
            "abstract_context": abstract_results,
            "abstract_question": abstract_question if return_abstract_question else None,
            "question": question,
            "elapsed_time": elapsed_time
        }

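The dictionary returned above can be consumed directly; a sketch assuming the pipeline, retriever, and llm instances are already wired up as in the other modules:

    sb = StepBackRAG(rag_pipeline=pipeline, vector_retriever=retriever, llm=llm)
    out = sb.query("Why does LoRA reduce fine-tuning memory?", top_k=5,
                   return_abstract_question=True)
    print(out["abstract_question"])   # e.g. a general question about low-rank adaptation
    print(len(out["specific_context"]), len(out["abstract_context"]))
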
    def generate_answer(
        self,
        question: str,
        formatter: PromptFormatter,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        document_type: str = "general",
        return_abstract_question: bool = False
    ) -> Dict:
        """
        Full Step-back RAG flow: dual-track retrieval -> answer generation

        Args:
            question: original question
            formatter: prompt formatter
            top_k: number of documents per track used for answer generation
            metadata_filter: optional metadata filter conditions
            document_type: document type ("paper", "cv", "general")
            return_abstract_question: whether to return the abstract question

        Returns:
            Dictionary containing the retrieval results, the generated answer, and statistics
        """
        start_time = time.time()

        # Step 1: dual-track retrieval
        retrieval_result = self.query(
            question=question,
            top_k=top_k,
            metadata_filter=metadata_filter,
            return_abstract_question=return_abstract_question
        )

        specific_results = retrieval_result["specific_context"]
        abstract_results = retrieval_result["abstract_context"]

        if not specific_results and not abstract_results:
            return {
                **retrieval_result,
                "answer": "抱歉,未找到相關文檔來回答此問題。",
                "formatted_context": None,
                "answer_time": 0.0,
                "total_time": retrieval_result["elapsed_time"]
            }

        # Step 2: format the two contexts
        specific_context = formatter.format_context(
            specific_results,
            document_type=document_type
        ) if specific_results else "未找到相關的具體事實資料。"

        abstract_context = formatter.format_context(
            abstract_results,
            document_type=document_type
        ) if abstract_results else "未找到相關的基礎原理資料。"

        # Step 3: build the fused prompt (the key step);
        # templates are language-matched, and the Chinese one is kept verbatim
        is_chinese = PromptFormatter.detect_language(question) == "zh"

        if is_chinese:
            final_prompt = f"""你是一個資深專家。請結合以下兩類資訊來回答使用者的具體問題。

【基礎原理與背景】
{abstract_context}

【具體事實資料】
{specific_context}

使用者問題:{question}

請根據原理推導並結合事實,給出一個專業且具備邏輯的回答:
"""
        else:
            final_prompt = f"""You are a senior expert. Please answer the user's specific question by combining the following two types of information.

【Fundamental Principles and Background】
{abstract_context}

【Specific Facts and Data】
{specific_context}

User question: {question}

Please provide a professional and logical answer based on principles and facts:
"""

        # Step 4: generate the answer
        logger.info("🤖 Generating answer...")
        answer_start = time.time()
        try:
            answer = self.llm.generate(
                prompt=final_prompt,
                temperature=self.answer_temperature,
                max_tokens=2048
            )
            answer_time = time.time() - answer_start
            logger.info(f"✅ Answer generation complete (elapsed: {answer_time:.2f}s)")
        except Exception as e:
            logger.error(f"❌ Error while generating the answer: {e}")
            answer = f"生成回答時出錯: {e}"
            answer_time = time.time() - answer_start

        total_time = time.time() - start_time

        return {
            **retrieval_result,
            "answer": answer,
            "formatted_context": {
                "specific": specific_context,
                "abstract": abstract_context
            },
            "answer_time": answer_time,
            "total_time": total_time
        }

src/subquery_rag.py
ADDED
@@ -0,0 +1,361 @@
"""
Sub-query Decomposition RAG: decompose a complex question into sub-questions before retrieval
"""
from typing import List, Dict, Optional
from .retrievers.reranker import RAGPipeline
from .prompt_formatter import PromptFormatter
from .llm_integration import OllamaLLM
import hashlib
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

logger = logging.getLogger(__name__)


class SubQueryDecompositionRAG:
    """RAG system using sub-question decomposition"""

    def __init__(
        self,
        rag_pipeline: RAGPipeline,
        llm: OllamaLLM,
        max_sub_queries: int = 3,
        top_k_per_subquery: int = 5,
        enable_parallel: bool = True
    ):
        """
        Initialize Sub-query Decomposition RAG

        Args:
            rag_pipeline: existing RAG pipeline instance
            llm: LLM instance (used to generate sub-questions)
            max_sub_queries: maximum number of sub-questions to generate
            top_k_per_subquery: number of results to retrieve per sub-question
            enable_parallel: whether to process sub-queries in parallel
        """
        self.rag_pipeline = rag_pipeline
        self.llm = llm
        self.max_sub_queries = max_sub_queries
        self.top_k_per_subquery = top_k_per_subquery
        self.enable_parallel = enable_parallel

    def _generate_sub_queries(self, question: str) -> List[str]:
        """
        Decompose the original question into sub-questions

        Args:
            question: original question

        Returns:
            List of sub-questions
        """
        # Detect the language; prompt templates are language-matched
        # (the Chinese template below is intentional and kept verbatim)
        is_chinese = PromptFormatter.detect_language(question) == "zh"

        if is_chinese:
            prompt = f"""你是一個專業助理。請將以下原始問題拆解成最多 {self.max_sub_queries} 個具體的子問題,以便進行資料搜尋。
每個子問題應專注於原始問題的一個特定面向。請以換行符號分隔問題。

原始問題: {question}

子問題清單:"""
        else:
            prompt = f"""You are a professional assistant. Please decompose the following original question into at most {self.max_sub_queries} specific sub-questions for information retrieval.
Each sub-question should focus on a specific aspect of the original question. Please separate questions with newlines.

Original question: {question}

Sub-question list:"""

        try:
            response = self.llm.generate(
                prompt=prompt,
                temperature=0.3,  # Lower temperature for more stable results
                max_tokens=500
            )

            # Parse the sub-questions
            sub_queries = [
                q.strip()
                for q in response.strip().split("\n")
                if q.strip() and not q.strip().startswith("#")
            ]

            # Strip numbering prefixes (e.g. "1. ", "1) ")
            cleaned_queries = []
            for q in sub_queries:
                # Strip leading numbering characters
                q = q.lstrip("0123456789. )")
                q = q.strip()
                if q:
                    cleaned_queries.append(q)

            # Cap the number of sub-questions
            cleaned_queries = cleaned_queries[:self.max_sub_queries]

            # Fall back to the original question if no sub-questions were produced
            if not cleaned_queries:
                logger.warning("⚠️ No sub-questions generated, using the original question")
                cleaned_queries = [question]

            return cleaned_queries

        except Exception as e:
            logger.error(f"⚠️ Error while generating sub-questions: {e}")
            # Fall back to the original question
            return [question]

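For illustration, given a hypothetical LLM response, the parsing above yields:

    # response = "1. What is X?\n2) How does X scale?\n# a note\n3. When does X fail?"
    # after splitting on newlines, dropping '#' lines, and lstrip("0123456789. )"):
    #   ["What is X?", "How does X scale?", "When does X fail?"]
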
    def _get_doc_id(self, doc: Dict) -> str:
        """
        Generate a unique identifier for a document

        Args:
            doc: document dictionary

        Returns:
            Unique ID
        """
        metadata = doc.get("metadata", {})
        content = doc.get("content", "")

        # Use a unique identifier from the metadata when available
        if "arxiv_id" in metadata and "chunk_index" in metadata:
            return f"{metadata['arxiv_id']}_{metadata['chunk_index']}"
        elif "file_path" in metadata and "chunk_index" in metadata:
            return f"{metadata['file_path']}_{metadata['chunk_index']}"
        else:
            # Fall back to a hash of the content
            content_hash = hashlib.md5(content.encode()).hexdigest()[:16]
            return f"doc_{content_hash}"

    def _retrieve_for_subquery(
        self,
        sub_query: str,
        metadata_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Retrieve results for a single sub-question

        Args:
            sub_query: sub-question
            metadata_filter: optional metadata filter conditions

        Returns:
            List of retrieval results
        """
        try:
            results = self.rag_pipeline.query(
                text=sub_query,
                top_k=self.top_k_per_subquery,
                metadata_filter=metadata_filter,
                enable_rerank=True
            )
            return results
        except Exception as e:
            logger.error(f"⚠️ Error while retrieving for sub-question '{sub_query}': {e}")
            return []

    def _get_unique_documents(
        self,
        sub_queries: List[str],
        metadata_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Retrieve for all sub-questions and remove duplicate documents

        Args:
            sub_queries: list of sub-questions
            metadata_filter: optional metadata filter conditions

        Returns:
            Deduplicated list of documents
        """
        unique_docs = {}

        if self.enable_parallel and len(sub_queries) > 1:
            # Process sub-queries in parallel
            logger.info(f"🔄 Processing {len(sub_queries)} sub-queries in parallel...")
            with ThreadPoolExecutor(max_workers=min(len(sub_queries), 5)) as executor:
                future_to_query = {
                    executor.submit(self._retrieve_for_subquery, q, metadata_filter): q
                    for q in sub_queries
                }

                for future in as_completed(future_to_query):
                    sub_query = future_to_query[future]
                    try:
                        docs = future.result()
                        logger.debug(f"✅ Sub-question '{sub_query}' found {len(docs)} results")
                        for doc in docs:
                            doc_id = self._get_doc_id(doc)
                            if doc_id not in unique_docs:
                                unique_docs[doc_id] = doc
                            else:
                                # If it already exists, keep the higher-scoring copy
                                existing_score = unique_docs[doc_id].get(
                                    'rerank_score',
                                    unique_docs[doc_id].get('hybrid_score', 0)
                                )
                                new_score = doc.get(
                                    'rerank_score',
                                    doc.get('hybrid_score', 0)
                                )
                                if new_score > existing_score:
                                    unique_docs[doc_id] = doc
                    except Exception as e:
                        logger.error(f"⚠️ Error while processing sub-question '{sub_query}': {e}")
        else:
            # Process serially
            logger.info(f"🔄 Processing {len(sub_queries)} sub-queries serially...")
            for sub_query in sub_queries:
                docs = self._retrieve_for_subquery(sub_query, metadata_filter)
                logger.debug(f"✅ Sub-question '{sub_query}' found {len(docs)} results")
                for doc in docs:
                    doc_id = self._get_doc_id(doc)
                    if doc_id not in unique_docs:
                        unique_docs[doc_id] = doc
                    else:
                        # Keep the higher-scoring copy
                        existing_score = unique_docs[doc_id].get(
                            'rerank_score',
                            unique_docs[doc_id].get('hybrid_score', 0)
                        )
                        new_score = doc.get(
                            'rerank_score',
                            doc.get('hybrid_score', 0)
                        )
                        if new_score > existing_score:
                            unique_docs[doc_id] = doc

        # Sort by score
        result_list = list(unique_docs.values())
        result_list.sort(
            key=lambda x: x.get('rerank_score', x.get('hybrid_score', 0)),
            reverse=True
        )

        return result_list

    def query(
        self,
        question: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        return_sub_queries: bool = False
    ) -> Dict:
        """
        Execute a Sub-query Decomposition RAG query

        Args:
            question: original question
            top_k: return the top k results
            metadata_filter: optional metadata filter conditions
            return_sub_queries: whether to include the sub-question list in the result

        Returns:
            Dictionary containing the retrieval results and statistics
        """
        start_time = time.time()

        # Step 1: generate sub-questions
        logger.info(f"🔍 Decomposing question: '{question}'")
        sub_queries = self._generate_sub_queries(question)
        logger.info(f"✅ Generated {len(sub_queries)} sub-questions:")
        for i, sq in enumerate(sub_queries, 1):
            logger.info(f"  {i}. {sq}")

        # Step 2: retrieve and deduplicate
        logger.info("📚 Retrieving relevant documents...")
        docs = self._get_unique_documents(sub_queries, metadata_filter)
        logger.info(f"✅ Found {len(docs)} unique documents (after deduplication)")

        # Step 3: return the top_k results
        final_results = docs[:top_k]

        elapsed_time = time.time() - start_time

        result = {
            "results": final_results,
            "total_docs_found": len(docs),
            "sub_queries": sub_queries if return_sub_queries else None,
            "elapsed_time": elapsed_time
        }

        return result

    def generate_answer(
        self,
        question: str,
        formatter: PromptFormatter,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        document_type: str = "general",
        return_sub_queries: bool = False
    ) -> Dict:
        """
        Full Sub-query Decomposition RAG flow: retrieval + answer generation

        Args:
            question: original question
            formatter: prompt formatter
            top_k: number of top results used for answer generation
            metadata_filter: optional metadata filter conditions
            document_type: document type ("paper", "cv", "general")
            return_sub_queries: whether to include the sub-question list in the result

        Returns:
            Dictionary containing the retrieval results, the generated answer, and statistics
        """
        # Retrieve
        retrieval_result = self.query(
            question=question,
            top_k=top_k,
            metadata_filter=metadata_filter,
            return_sub_queries=return_sub_queries
        )

        if not retrieval_result["results"]:
            return {
                **retrieval_result,
                "answer": "抱歉,未找到相關文檔來回答此問題。",
                "formatted_context": None
            }

        # Format the context
        formatted_context = formatter.format_context(
            retrieval_result["results"],
            document_type=document_type
        )

        # Build the prompt
        prompt = formatter.create_prompt(
            question,
            formatted_context,
            document_type=document_type
        )

        # Generate the answer
        logger.info("🤖 Generating answer...")
        answer_start = time.time()
        try:
            answer = self.llm.generate(
                prompt=prompt,
                temperature=0.7,
                max_tokens=2048
            )
            answer_time = time.time() - answer_start
            logger.info(f"✅ Answer generation complete (elapsed: {answer_time:.2f}s)")
        except Exception as e:
            logger.error(f"❌ Error while generating the answer: {e}")
            answer = f"生成回答時出錯: {e}"
            answer_time = time.time() - answer_start

        return {
            **retrieval_result,
            "answer": answer,
            "formatted_context": formatted_context,
            "answer_time": answer_time,
            "total_time": retrieval_result["elapsed_time"] + answer_time
        }

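An end-to-end sketch for the class above, assuming pipeline and llm are constructed as in the other modules and that PromptFormatter's defaults suffice:

    sq = SubQueryDecompositionRAG(rag_pipeline=pipeline, llm=llm, max_sub_queries=3)
    out = sq.generate_answer(
        "Compare dense and sparse retrieval for scientific papers",
        formatter=PromptFormatter(),
        top_k=5,
        return_sub_queries=True,
    )
    print(out["sub_queries"])
    print(out["answer"][:200])
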
src/triple_hybrid_rag.py
ADDED
@@ -0,0 +1,467 @@
| 1 |
+
"""
|
| 2 |
+
Triple Hybrid RAG:融合 SubQuery + HyDE + Step-back Prompting
|
| 3 |
+
結合三種技術的優勢,實現最強大的 RAG 系統
|
| 4 |
+
"""
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
from .retrievers.reranker import RAGPipeline
|
| 7 |
+
from .retrievers.vector_retriever import VectorRetriever
|
| 8 |
+
from .prompt_formatter import PromptFormatter
|
| 9 |
+
from .llm_integration import OllamaLLM
|
| 10 |
+
import hashlib
|
| 11 |
+
import time
|
| 12 |
+
import logging
|
| 13 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TripleHybridRAG:
|
| 19 |
+
"""融合 SubQuery + HyDE + Step-back 的三重混合 RAG 系統"""
|
| 20 |
+
|
| 21 |
+
def __init__(
|
| 22 |
+
self,
|
| 23 |
+
rag_pipeline: RAGPipeline,
|
| 24 |
+
vector_retriever: VectorRetriever,
|
| 25 |
+
llm: OllamaLLM,
|
| 26 |
+
max_sub_queries: int = 3,
|
| 27 |
+
top_k_per_subquery: int = 5,
|
| 28 |
+
hypothetical_length: int = 200,
|
| 29 |
+
temperature_subquery: float = 0.3,
|
| 30 |
+
temperature_hyde: float = 0.7,
|
| 31 |
+
temperature_stepback: float = 0.3,
|
| 32 |
+
answer_temperature: float = 0.7,
|
| 33 |
+
enable_parallel: bool = True
|
| 34 |
+
):
|
| 35 |
+
"""
|
| 36 |
+
初始化三重混合 RAG
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
rag_pipeline: RAG 管線實例
|
| 40 |
+
vector_retriever: 向量檢索器
|
| 41 |
+
llm: LLM 實例
|
| 42 |
+
max_sub_queries: 最多生成的子問題數量
|
| 43 |
+
top_k_per_subquery: 每個子問題檢索的結果數量
|
| 44 |
+
hypothetical_length: 假設性文檔目標長度(字符數)
|
| 45 |
+
temperature_subquery: 生成子問題的溫度(較低,更穩定)
|
| 46 |
+
temperature_hyde: 生成假設性文檔的溫度(較高,更多專業術語)
|
| 47 |
+
temperature_stepback: 生成抽象問題的溫度(較低,更穩定)
|
| 48 |
+
answer_temperature: 生成答案的溫度
|
| 49 |
+
enable_parallel: 是否並行處理
|
| 50 |
+
"""
|
| 51 |
+
self.rag_pipeline = rag_pipeline
|
| 52 |
+
self.vector_retriever = vector_retriever
|
| 53 |
+
self.llm = llm
|
| 54 |
+
self.max_sub_queries = max_sub_queries
|
| 55 |
+
self.top_k_per_subquery = top_k_per_subquery
|
| 56 |
+
self.hypothetical_length = hypothetical_length
|
| 57 |
+
self.temperature_subquery = temperature_subquery
|
| 58 |
+
self.temperature_hyde = temperature_hyde
|
| 59 |
+
self.temperature_stepback = temperature_stepback
|
| 60 |
+
self.answer_temperature = answer_temperature
|
| 61 |
+
self.enable_parallel = enable_parallel
|
| 62 |
+
|
| 63 |
+
def _generate_sub_queries(self, question: str) -> List[str]:
|
| 64 |
+
"""生成子問題(SubQuery)"""
|
| 65 |
+
is_chinese = PromptFormatter.detect_language(question) == "zh"
|
| 66 |
+
|
| 67 |
+
if is_chinese:
|
| 68 |
+
prompt = f"""你是一個專業助理。請將以下原始問題拆解成最多 {self.max_sub_queries} 個具體的子問題,以便進行資料搜尋。
|
| 69 |
+
每個子問題應專注於原始問題的一個特定面向。請以換行符號分隔問題。
|
| 70 |
+
|
| 71 |
+
原始問題: {question}
|
| 72 |
+
|
| 73 |
+
子問題清單:"""
|
| 74 |
+
else:
|
| 75 |
+
prompt = f"""You are a professional assistant. Please decompose the following original question into at most {self.max_sub_queries} specific sub-questions for information retrieval.
|
| 76 |
+
Each sub-question should focus on a specific aspect of the original question. Please separate questions with newlines.
|
| 77 |
+
|
| 78 |
+
Original question: {question}
|
| 79 |
+
|
| 80 |
+
Sub-question list:"""
|
| 81 |
+
|
| 82 |
+
try:
|
| 83 |
+
response = self.llm.generate(
|
| 84 |
+
prompt=prompt,
|
| 85 |
+
temperature=self.temperature_subquery,
|
| 86 |
+
max_tokens=500
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
sub_queries = [
|
| 90 |
+
q.strip()
|
| 91 |
+
for q in response.strip().split("\n")
|
| 92 |
+
if q.strip() and not q.strip().startswith("#")
|
| 93 |
+
]
|
| 94 |
+
|
| 95 |
+
# 移除編號前綴
|
| 96 |
+
cleaned_queries = []
|
| 97 |
+
for q in sub_queries:
|
| 98 |
+
q = q.lstrip("0123456789. )")
|
| 99 |
+
q = q.strip()
|
| 100 |
+
if q:
|
| 101 |
+
cleaned_queries.append(q)
|
| 102 |
+
|
| 103 |
+
cleaned_queries = cleaned_queries[:self.max_sub_queries]
|
| 104 |
+
|
| 105 |
+
if not cleaned_queries:
|
| 106 |
+
logger.warning("⚠️ 未生成子問題,使用原始問題")
|
| 107 |
+
cleaned_queries = [question]
|
| 108 |
+
|
| 109 |
+
return cleaned_queries
|
| 110 |
+
|
| 111 |
+
except Exception as e:
|
| 112 |
+
logger.error(f"⚠️ 生成子問題時出錯: {e}")
|
| 113 |
+
return [question]
|
| 114 |
+
|
    def _generate_hypothetical_document(self, sub_query: str) -> str:
        """Generate a hypothetical document for a sub-question (HyDE)."""
        is_chinese = PromptFormatter.detect_language(sub_query) == "zh"

        if is_chinese:
            prompt = f"""請針對以下問題,寫出一段約 {self.hypothetical_length} 字的專業技術檔案內容。
這段內容應包含該領域常見的專業術語與原理說明,以便用於後續的語義檢索。
請使用專業的術語和概念,即使你對某些細節不確定,也要包含相關的專業詞彙。

問題: {sub_query}

專業技術內容:"""
        else:
            prompt = f"""Please write a professional technical document of approximately {self.hypothetical_length} words in response to the following question.
This content should include common professional terminology and principle explanations in this field, to be used for subsequent semantic retrieval.
Please use professional terms and concepts, and include relevant professional vocabulary even if you are uncertain about some details.

Question: {sub_query}

Professional technical content:"""

        try:
            hypothetical_doc = self.llm.generate(
                prompt=prompt,
                temperature=self.temperature_hyde,
                max_tokens=500
            )

            hypothetical_doc = hypothetical_doc.strip()

            if not hypothetical_doc:
                logger.warning(f"⚠️ Hypothetical document for sub-question '{sub_query}' is empty; using the sub-question itself")
                return sub_query

            return hypothetical_doc

        except Exception as e:
            logger.error(f"⚠️ Error while generating hypothetical document: {e}")
            return sub_query

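    # Why HyDE works here: a short question and a long document chunk embed
    # into different regions of the vector space, so retrieving with a
    # generated "answer-like" passage turns the search into document-to-document
    # similarity, which tends to match chunk embeddings better. Factual errors
    # in the hypothetical text matter little, because only its embedding is
    # used -- the text itself never reaches the final answer prompt.
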
    def _generate_step_back_question(self, question: str) -> str:
        """Generate a Step-back abstract question."""
        is_chinese = PromptFormatter.detect_language(question) == "zh"

        if is_chinese:
            prompt = f"""你是一個資深專家。請將以下具體問題轉換為一個更抽象、更基礎的原理性問題。
這個抽象問題應該幫助理解該領域的基礎概念和原理,而不是直接回答具體問題。

具體問題: {question}

請生成一個抽象問題,用於檢索相關的原理和背景知識:
"""
        else:
            prompt = f"""You are a senior expert. Please convert the following specific question into a more abstract, fundamental question about principles and concepts.
This abstract question should help understand the basic concepts and principles in this field, rather than directly answering the specific question.

Specific question: {question}

Please generate an abstract question for retrieving relevant principles and background knowledge:
"""

        try:
            abstract_question = self.llm.generate(
                prompt=prompt,
                temperature=self.temperature_stepback,
                max_tokens=200
            )

            abstract_question = abstract_question.strip()

            if not abstract_question:
                logger.warning("⚠️ Generated abstract question is empty; using the original question")
                return question

            return abstract_question

        except Exception as e:
            logger.error(f"⚠️ Error while generating abstract question: {e}")
            return question

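    # Illustrative step-back transformation (hypothetical):
    #     specific: "Why does my cosine score drop after re-chunking to 256 tokens?"
    #     abstract: "How does chunk size affect embedding quality and similarity
    #                scores in vector retrieval?"
    # The abstract query pulls in principle-level chunks that the specific
    # query alone would miss.
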
    def _get_doc_id(self, doc: Dict) -> str:
        """Generate a unique identifier for a document chunk."""
        metadata = doc.get("metadata", {})
        content = doc.get("content", "")

        if "arxiv_id" in metadata and "chunk_index" in metadata:
            return f"{metadata['arxiv_id']}_{metadata['chunk_index']}"
        elif "file_path" in metadata and "chunk_index" in metadata:
            return f"{metadata['file_path']}_{metadata['chunk_index']}"
        else:
            # Content hash fallback (for deduplication only, not security)
            content_hash = hashlib.md5(content.encode()).hexdigest()[:16]
            return f"doc_{content_hash}"

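    # Example identifiers this produces (assuming typical chunk metadata):
    #     {"metadata": {"arxiv_id": "2401.01234", "chunk_index": 3}}  -> "2401.01234_3"
    #     {"metadata": {"file_path": "docs/a.pdf", "chunk_index": 0}} -> "docs/a.pdf_0"
    #     no usable metadata                                          -> "doc_<16-char md5 prefix>"
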
    def _process_subquery_with_hyde(
        self,
        sub_query: str,
        metadata_filter: Optional[Dict] = None
    ) -> tuple:
        """Process a single sub-question: generate a hypothetical document and retrieve with it."""
        try:
            hypothetical_doc = self._generate_hypothetical_document(sub_query)
            results = self.vector_retriever.retrieve(
                query=hypothetical_doc,
                top_k=self.top_k_per_subquery,
                metadata_filter=metadata_filter
            )
            return results, hypothetical_doc
        except Exception as e:
            logger.error(f"⚠️ Error while processing sub-question '{sub_query}': {e}")
            return [], ""

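    # The (results, hypothetical_doc) tuple contract lets query() both merge
    # the retrieved chunks and, when return_hypothetical=True, surface the
    # generated text for inspection; the ([], "") fallback keeps one failed
    # sub-question from aborting the whole parallel batch.
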
    def query(
        self,
        question: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        return_sub_queries: bool = False,
        return_hypothetical: bool = False,
        return_abstract_question: bool = False
    ) -> Dict:
        """
        Run the triple hybrid RAG retrieval.

        Flow:
        1. Decompose the question into sub-questions (SubQuery)
        2. Generate a hypothetical document per sub-question and retrieve with it (HyDE)
        3. Retrieve directly with the original question (specific facts)
        4. Generate an abstract question and retrieve with it (Step-back, abstract principles)
        5. Merge all results and deduplicate
        """
        start_time = time.time()

        # Step 1: generate sub-questions
        logger.info(f"🔍 [SubQuery] Decomposing question: '{question}'")
        sub_queries = self._generate_sub_queries(question)
        logger.info(f"✅ Generated {len(sub_queries)} sub-questions")

        # Step 2: generate a hypothetical document per sub-question and retrieve (HyDE)
        logger.info("📚 [HyDE] Generating hypothetical documents and retrieving for each sub-question...")
        subquery_results = []
        hypothetical_docs = {}

        if self.enable_parallel and len(sub_queries) > 1:
            with ThreadPoolExecutor(max_workers=min(len(sub_queries), 5)) as executor:
                future_to_query = {
                    executor.submit(self._process_subquery_with_hyde, sq, metadata_filter): sq
                    for sq in sub_queries
                }

                # Results are aggregated on the main thread, so no locking is needed
                for future in as_completed(future_to_query):
                    sub_query = future_to_query[future]
                    try:
                        results, hypo_doc = future.result()
                        hypothetical_docs[sub_query] = hypo_doc
                        subquery_results.extend(results)
                    except Exception as e:
                        logger.error(f"⚠️ Error while processing sub-question '{sub_query}': {e}")
        else:
            for sub_query in sub_queries:
                results, hypo_doc = self._process_subquery_with_hyde(sub_query, metadata_filter)
                hypothetical_docs[sub_query] = hypo_doc
                subquery_results.extend(results)

        # Step 3: Step-back dual-track retrieval
        logger.info("🔍 [Step-back] Running dual-track retrieval...")

        if self.enable_parallel:
            with ThreadPoolExecutor(max_workers=2) as executor:
                direct_future = executor.submit(
                    self.vector_retriever.retrieve,
                    question, top_k, metadata_filter
                )
                abstract_question = self._generate_step_back_question(question)
                step_back_future = executor.submit(
                    self.vector_retriever.retrieve,
                    abstract_question, top_k, metadata_filter
                )

                specific_results = direct_future.result()
                abstract_results = step_back_future.result()
        else:
            specific_results = self.vector_retriever.retrieve(
                query=question,
                top_k=top_k,
                metadata_filter=metadata_filter
            )
            abstract_question = self._generate_step_back_question(question)
            abstract_results = self.vector_retriever.retrieve(
                query=abstract_question,
                top_k=top_k,
                metadata_filter=metadata_filter
            )

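        # Note on the fusion below: duplicates across the three tracks are
        # collapsed by keeping the highest raw score, which assumes all tracks
        # score on a comparable scale (true here, since every track goes
        # through the same vector_retriever). With heterogeneous retrievers, a
        # rank-based fusion such as Reciprocal Rank Fusion would be safer --
        # e.g. score(d) = sum over tracks of 1 / (60 + rank_track(d)). An
        # illustrative sketch, not used by this method:
        #
        #     from collections import defaultdict
        #     rrf = defaultdict(float)
        #     for track in (subquery_results, specific_results, abstract_results):
        #         for rank, doc in enumerate(track, start=1):
        #             rrf[self._get_doc_id(doc)] += 1.0 / (60 + rank)
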
        # Step 4: merge all results and deduplicate
        logger.info("🔄 Merging and deduplicating all retrieval results...")
        all_results = subquery_results + specific_results + abstract_results
        unique_docs = {}

        for doc in all_results:
            doc_id = self._get_doc_id(doc)
            if doc_id not in unique_docs:
                unique_docs[doc_id] = doc
            else:
                # Keep the copy with the higher score
                existing_score = unique_docs[doc_id].get('score', 0)
                new_score = doc.get('score', 0)
                if new_score > existing_score:
                    unique_docs[doc_id] = doc

        # Sort by score and return the top_k
        result_list = list(unique_docs.values())
        result_list.sort(key=lambda x: x.get('score', 0), reverse=True)
        final_results = result_list[:top_k]

        elapsed_time = time.time() - start_time
        logger.info(
            f"✅ Triple hybrid retrieval finished (elapsed: {elapsed_time:.2f}s)\n"
            f"   Sub-question retrieval: {len(subquery_results)} results\n"
            f"   Specific facts: {len(specific_results)} results\n"
            f"   Abstract principles: {len(abstract_results)} results\n"
            f"   {len(result_list)} unique after deduplication, returning top {len(final_results)}"
        )

        return {
            "results": final_results,
            "total_docs_found": len(result_list),
            "sub_queries": sub_queries if return_sub_queries else None,
            "hypothetical_documents": hypothetical_docs if return_hypothetical else None,
            "abstract_question": abstract_question if return_abstract_question else None,
            "subquery_results": subquery_results,
            "specific_context": specific_results,
            "abstract_context": abstract_results,
            "question": question,
            "elapsed_time": elapsed_time
        }

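    # Retrieval-only usage sketch (illustrative; `rag` as constructed above):
    #
    #     out = rag.query("為什麼 Transformer 需要位置編碼?", top_k=5,
    #                     return_sub_queries=True)
    #     for doc in out["results"]:
    #         print(doc["score"], doc["metadata"])
    #
    # The returned dict always carries the per-track raw results
    # ("subquery_results", "specific_context", "abstract_context"); the
    # return_* flags only control whether the intermediate LLM artifacts
    # (sub-questions, hypothetical documents, abstract question) are included.
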
    def generate_answer(
        self,
        question: str,
        formatter: PromptFormatter,
        top_k: int = 5,
        metadata_filter: Optional[Dict] = None,
        document_type: str = "general",
        return_sub_queries: bool = False,
        return_hypothetical: bool = False,
        return_abstract_question: bool = False
    ) -> Dict:
        """
        Complete triple hybrid RAG flow: retrieval + answer generation.
        """
        start_time = time.time()

        # Retrieval
        retrieval_result = self.query(
            question=question,
            top_k=top_k,
            metadata_filter=metadata_filter,
            return_sub_queries=return_sub_queries,
            return_hypothetical=return_hypothetical,
            return_abstract_question=return_abstract_question
        )

        if not retrieval_result["results"]:
            return {
                **retrieval_result,
                "answer": "抱歉,未找到相關文檔來回答此問題。",
                "formatted_context": None,
                "answer_time": 0.0,
                "total_time": retrieval_result["elapsed_time"]
            }

        # Format the three context tracks
        subquery_context = formatter.format_context(
            retrieval_result["subquery_results"][:top_k],
            document_type=document_type
        ) if retrieval_result.get("subquery_results") else "未找到相關的子問題檢索結果。"

        specific_context = formatter.format_context(
            retrieval_result["specific_context"],
            document_type=document_type
        ) if retrieval_result.get("specific_context") else "未找到相關的具體事實資料。"

        abstract_context = formatter.format_context(
            retrieval_result["abstract_context"],
            document_type=document_type
        ) if retrieval_result.get("abstract_context") else "未找到相關的基礎原理資料。"

        # Build the fusion prompt (the key step)
        is_chinese = PromptFormatter.detect_language(question) == "zh"

        if is_chinese:
            final_prompt = f"""你是一個資深專家。請結合以下三類資訊來回答使用者的具體問題。

【基礎原理與背景】(來自 Step-back 抽象問題檢索)
{abstract_context}

【具體事實資料】(來自直接問題檢索)
{specific_context}

【子問題相關資料】(來自 SubQuery + HyDE 檢索)
{subquery_context}

使用者問題:{question}

請根據原理推導、結合具體事實,並參考子問題的相關資料,給出一個專業、全面且具備邏輯的回答:
"""
        else:
            final_prompt = f"""You are a senior expert. Please answer the user's specific question by combining the following three types of information.

【Fundamental Principles and Background】(from Step-back abstract question retrieval)
{abstract_context}

【Specific Facts and Data】(from direct question retrieval)
{specific_context}

【Sub-question Related Information】(from SubQuery + HyDE retrieval)
{subquery_context}

User question: {question}

Please provide a professional, comprehensive, and logical answer based on principles, facts, and sub-question related information:
"""

        # Generate the answer
        logger.info("🤖 Generating answer...")
        answer_start = time.time()
        try:
            answer = self.llm.generate(
                prompt=final_prompt,
                temperature=self.answer_temperature,
                max_tokens=2048
            )
            answer_time = time.time() - answer_start
            logger.info(f"✅ Answer generated (elapsed: {answer_time:.2f}s)")
        except Exception as e:
            logger.error(f"❌ Error while generating answer: {e}")
            answer = f"生成回答時出錯: {e}"
            answer_time = time.time() - answer_start

        total_time = time.time() - start_time

        return {
            **retrieval_result,
            "answer": answer,
            "formatted_context": {
                "subquery": subquery_context,
                "specific": specific_context,
                "abstract": abstract_context
            },
            "answer_time": answer_time,
            "total_time": total_time
        }
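
    # End-to-end usage sketch (illustrative; the formatter construction and the
    # document_type value are assumptions -- PromptFormatter's actual interface
    # lives in src/prompt_formatter.py):
    #
    #     formatter = PromptFormatter()
    #     result = rag.generate_answer(
    #         question="What is positional encoding in Transformers?",
    #         formatter=formatter,
    #         top_k=5,
    #         document_type="general",
    #     )
    #     print(result["answer"])
    #     print(result["total_time"], "s,", result["total_docs_found"], "unique chunks")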