aelsaeed committed on
Commit
1c4ea2c
·
verified ·
1 Parent(s): 9010851

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -57
app.py CHANGED
@@ -1,59 +1,165 @@
1
- import joblib
2
  import gradio as gr
 
3
  import numpy as np
4
- from sentence_transformers import SentenceTransformer
5
- from sklearn.metrics.pairwise import cosine_similarity
6
-
7
- # =========================
8
- # تحميل البيانات
9
- # =========================
10
- books = joblib.load("books.pkl")
11
- theses = joblib.load("theses.pkl")
12
-
13
- books_emb = joblib.load("books_embeddings.pkl")
14
- theses_emb = joblib.load("theses_embeddings.pkl")
15
-
16
- # =========================
17
- # تحميل الموديل المحلي
18
- # =========================
19
- model = SentenceTransformer("AI_Library_Model")
20
-
21
- # =========================
22
- # دمج البيانات
23
- # =========================
24
- all_embeddings = np.vstack([books_emb, theses_emb])
25
- all_texts = books + theses
26
-
27
- # =========================
28
- # دالة البحث
29
- # =========================
30
- def search_library(query, top_k=5):
31
- if not query.strip():
32
- return "من فضلك اكتب سؤالًا"
33
-
34
- query_emb = model.encode(query)
35
-
36
- scores = cosine_similarity([query_emb], all_embeddings)[0]
37
- top_indices = scores.argsort()[-top_k:][::-1]
38
-
39
- results = []
40
- for i in top_indices:
41
- results.append(f"- {all_texts[i]}")
42
-
43
- return "\n".join(results)
44
-
45
- # =========================
46
- # واجهة Gradio
47
- # =========================
48
- demo = gr.Interface(
49
- fn=search_library,
50
- inputs=gr.Textbox(
51
- label="اكتب سؤالك",
52
- placeholder="مثال: organic chemistry synthesis"
53
- ),
54
- outputs=gr.Textbox(label="النتائج"),
55
- title="📚 AI Library Explorer",
56
- description="بحث ذكي في الكتب والرسائل العلمية (عربي / إنجليزي)"
57
- )
58
-
59
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ================== imports ==================
2
  import gradio as gr
3
+ import pandas as pd
4
  import numpy as np
5
+ import os, pickle, tempfile
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import gdown
8
+
9
# ================== Settings ==================
# Local spreadsheet names, paired with the Google Drive ids declared below.
BOOKS_FILE, THESES_FILE = "book.xlsx", "theses.xlsx"

DRIVE_BOOKS_ID = "1FElHiASfiVLeuHWYaqd2Q5foxWRlJT-O"
DRIVE_THESES_ID = "1K2Mtze6ZdvfKUsFMCOWlRBjDq-ZnJNrv"

# Directory for pickled embeddings; created on import when missing.
EMB_DIR = "embeddings"
os.makedirs(EMB_DIR, exist_ok=True)

# Sentence-embedding model, instantiated once and shared by every search.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(MODEL_NAME)
21
+
22
# ================== Download from Drive ==================
def download_from_drive(file_id, output):
    """Fetch a Google Drive file to ``output`` unless that path already exists.

    The download is best-effort: a failure is logged and then swallowed, so
    the function never raises because of a download problem.

    Args:
        file_id: Google Drive file id, inserted into the ``uc?id=`` URL.
        output: Local path the download is written to.

    Returns:
        None.
    """
    import logging

    if os.path.exists(output):
        return

    log = logging.getLogger(__name__)
    url = f"https://drive.google.com/uc?id={file_id}"
    try:
        result = gdown.download(url, output, quiet=True)
    except Exception:
        # Deliberately broad: any download error is non-fatal here, but it
        # must leave a trace instead of disappearing silently.
        log.warning("Download of %s to %s failed", url, output, exc_info=True)
        return

    # NOTE(review): some gdown versions appear to signal failure by returning
    # None rather than raising -- confirm against the installed version.
    if result is None:
        log.warning("Download of %s to %s produced no file", url, output)
30
+
31
# Request both spreadsheets at import time, books first and theses second.
for _drive_id, _target in (
    (DRIVE_BOOKS_ID, BOOKS_FILE),
    (DRIVE_THESES_ID, THESES_FILE),
):
    download_from_drive(_drive_id, _target)
33
+
34
# ================== Load and merge the files ==================
def load_and_merge():
    """Load the books and theses spreadsheets and return them as one table.

    Each sheet receives a ``Title`` column when it lacks one (copied from
    ``العنوان`` or, failing that, from its first column) and a ``المصدر``
    column tagging its rows as ``كتاب`` or ``رسالة``.

    Returns:
        A DataFrame with the book rows followed by the thesis rows, a fresh
        0..n-1 index, and missing values replaced by empty strings.

    Raises:
        FileNotFoundError: If either spreadsheet is absent from disk.
    """
    if not (os.path.exists(BOOKS_FILE) and os.path.exists(THESES_FILE)):
        raise FileNotFoundError("❌ تأكدي من وجود book.xlsx و theses.xlsx")

    def normalize_title(df):
        """Ensure ``df`` has a ``Title`` column and return it."""
        if "Title" not in df.columns:
            if "العنوان" in df.columns:
                df["Title"] = df["العنوان"].astype(str)
            else:
                # No recognised title column: fall back to the first column.
                df["Title"] = df.iloc[:, 0].astype(str)
        return df

    # Fill before deriving Title so blank cells become "" rather than "nan".
    books = normalize_title(pd.read_excel(BOOKS_FILE).fillna(""))
    theses = normalize_title(pd.read_excel(THESES_FILE).fillna(""))

    # Tag every row with the kind of source it came from.
    books["المصدر"] = "كتاب"
    theses["المصدر"] = "رسالة"

    merged = pd.concat([books, theses], ignore_index=True)
    # concat inserts NaN for columns that exist in only one sheet, so the
    # earlier per-sheet fill is not enough; fill again on the merged table.
    return merged.fillna("")
61
+
62
# Merged books + theses table, built once when the module is imported.
library_df = load_and_merge()
63
+
64
+ # ================== Embeddings ==================
65
def emb_path(name):
    """Return the ``<name>.pkl`` path inside ``EMB_DIR``."""
    filename = "{}.pkl".format(name)
    return os.path.join(EMB_DIR, filename)
67
+
68
def build_or_load_embeddings(df, name):
    """Return title embeddings for ``df``, reusing the on-disk cache if usable.

    The cache file is accepted when it can be unpickled and holds as many
    entries as ``df`` has rows. Otherwise the ``Title`` column is encoded
    with the module-level ``model`` and the cache is rewritten.

    Args:
        df: Table with a ``Title`` column.
        name: Cache name; the file location comes from ``emb_path(name)``.

    Returns:
        The embeddings, one entry per row of ``df``.
    """
    import logging

    log = logging.getLogger(__name__)
    path = emb_path(name)

    if os.path.exists(path):
        try:
            # NOTE(security): pickle.load can execute code stored in the
            # file; this is only acceptable while the cache directory is
            # writable by trusted parties alone.
            with open(path, "rb") as f:
                cached = pickle.load(f)
            # NOTE(review): validity is judged by row count only, so edits
            # that keep the row count will not refresh the cache.
            if len(cached) == len(df):
                return cached
        except (
            OSError,
            EOFError,
            pickle.UnpicklingError,
            AttributeError,
            ImportError,
            IndexError,
            TypeError,
            ValueError,
        ):
            # A truncated or corrupt cache must not block startup: rebuild.
            log.warning("Ignoring unreadable embedding cache %s", path, exc_info=True)

    texts = df["Title"].astype(str).tolist()
    emb = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

    # Write to a sibling file and rename, so an interrupted dump can never
    # leave a half-written cache at the real path.
    tmp_path = path + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(emb, f)
    os.replace(tmp_path, path)
    return emb
81
+
82
# Title embeddings for the merged table, cached on disk under the name "library".
library_embeddings = build_or_load_embeddings(library_df, "library")
83
+
84
# ================== Render results as HTML ==================
def results_to_html(df):
    """Render ``df`` as an HTML table string, leaving out the index column.

    NOTE(review): ``escape=False`` leaves cell text unescaped, so any markup
    present in the data is emitted as-is.
    """
    html_table = df.to_html(escape=False, index=False)
    return html_table
87
+
88
# ================== Search ==================
def local_search_df(query, mode, source_filter):
    """Search the library and return the hits as HTML plus a DataFrame.

    Args:
        query: Text typed by the user. ``None`` or a blank string yields a
            prompt message and an empty DataFrame.
        mode: ``"نصي"`` runs a case-insensitive literal substring match on
            the ``Title`` column; any other value runs the semantic search.
        source_filter: ``"الكل"`` keeps every row; any other value keeps only
            rows whose ``المصدر`` column equals it.

    Returns:
        A ``(html, df)`` tuple: the rendered table and the DataFrame behind
        it. In semantic mode the rows carry a ``score`` column and are sorted
        by it in descending order.
    """
    if not query or not query.strip():
        return "<p>⚠️ اكتب كلمة أو جملة للبحث</p>", pd.DataFrame()
    query = query.strip()

    # Positional boolean mask: it selects the rows here and, in semantic
    # mode, the matching entries of the full-length score vector.
    if source_filter == "الكل":
        keep = np.ones(len(library_df), dtype=bool)
    else:
        keep = (library_df["المصدر"] == source_filter).to_numpy()
    df_search = library_df[keep].copy()

    if mode == "نصي":
        # ---- text search ----
        # "Title" is the column unified across both sheets. regex=False
        # makes the query literal, so input such as "c++" or "(" is safe.
        titles = df_search["Title"].astype(str)
        hits = titles.str.contains(query, case=False, na=False, regex=False)
        df = df_search[hits]
    else:
        # ---- semantic search ----
        q_emb = model.encode([query], convert_to_numpy=True)
        scores = util.cos_sim(q_emb, library_embeddings)[0].cpu().numpy()
        # Scores cover every library row; keep only those that survived the
        # source filter so the lengths line up.
        df_search["score"] = scores[keep]
        df = df_search.sort_values("score", ascending=False)

    if df.empty:
        df = pd.DataFrame([{"نتيجة": "❌ لم يتم العثور على نتائج"}])

    # NOTE(review): this also drops a "Title" column that came from the
    # spreadsheet itself, not only a synthesized one -- confirm intended.
    if "Title" in df.columns:
        df = df.drop(columns=["Title"])

    return results_to_html(df), df
119
+
120
# ================== Save results to Excel ==================
def save_to_excel(df):
    """Write ``df`` to a new temporary ``.xlsx`` file and return its path.

    Args:
        df: Results to export. ``None`` or an empty frame produces a workbook
            written from an empty DataFrame.

    Returns:
        Path of the written file. It is created with ``delete=False``, so it
        is not removed automatically.
    """
    # Only the unique name is needed. Closing the handle straight away stops
    # it leaking and lets the path be reopened for writing, which an open
    # handle prevents on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        path = tmp.name

    frame = pd.DataFrame() if df is None or df.empty else df
    frame.to_excel(path, index=False)
    return path
128
+
129
# ================== Interface ==================
with gr.Blocks(title="البحث الدلالي بالمكتبة") as app:
    gr.Markdown("## 🔍 البحث بالمكتبة (ملف موحد)")

    # --- inputs ---
    query = gr.Textbox(label="اكتب كلمة أو موضوع البحث")
    mode = gr.Radio(
        choices=["نصي", "دلالي (Semantic)"],
        label="نوع البحث",
        value="نصي",
    )
    source_filter = gr.Radio(
        choices=["الكل", "كتاب", "رسالة"],
        label="فلترة حسب المصدر",
        value="الكل",
    )
    btn_search = gr.Button("🔎 بحث")

    # --- outputs ---
    # df_state carries the latest result frame from a search to the export.
    df_state = gr.State()
    output_html = gr.HTML()
    file_out = gr.File(label="⬇️ تحميل النتائج")
    btn_save = gr.Button("📥 حفظ النتائج")

    # --- wiring ---
    btn_search.click(
        fn=local_search_df,
        inputs=[query, mode, source_filter],
        outputs=[output_html, df_state],
    )
    btn_save.click(fn=save_to_excel, inputs=df_state, outputs=file_out)

app.launch()