Spaces:

54justin
/

591Analyzer

Paused

App Files Files Community

54justin committed on Oct 8

Commit

e538e84

verified ·

1 Parent(s): 7ba1c88

Upload 11 files

Browse files

Files changed (7) hide show

591_rental_analysis.ipynb +914 -1
app.py +10 -179
data_generator.py +206 -0
gradio_app.py +347 -0
main.py +6 -0
rental_analyzer.py +287 -0
requirements.txt +7 -8

591_rental_analysis.ipynb CHANGED Viewed

@@ -1,5 +1,918 @@
 {
- "cells": [],
  "metadata": {
   "language_info": {
    "name": "python"

 {
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "420f56b5",
+   "metadata": {},
+   "source": [
+    "# 591租屋網資料分析 - 高雄市鼓山區\n",
+    "## 由 Copilot 生成\n",
+    "\n",
+    "本筆記本將從591租屋網抓取高雄市鼓山區的租屋資料，並進行詳細的統計分析。\n",
+    "\n",
+    "**分析目標:**\n",
+    "- 目標區域：高雄市鼓山區\n",
+    "- 物件類型：2房、整層、電梯大樓\n",
+    "- 分析內容：租金分布、平均租金、中位數租金等統計資訊\n",
+    "- 整合：Hugging Face生態系統用於文字分析\n",
+    "\n",
+    "**資料來源:** https://rent.591.com.tw/list?region=17&section=247&kind=1&layout=2&shape=2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ac100473",
+   "metadata": {},
+   "source": [
+    "## 1. 導入必要套件\n",
+    "首先導入所有需要的套件，包括網頁爬蟲、資料處理、視覺化和Hugging Face相關套件。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "515be3d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "# 導入基本套件\n",
+    "import json\n",
+    "import re\n",
+    "import sys\n",
+    "import time\n",
+    "import warnings\n",
+    "from datetime import datetime\n",
+    "from typing import List, Dict, Optional\n",
+    "from urllib.parse import urljoin\n",
+    "\n",
+    "import requests\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "# 網頁爬蟲相關\n",
+    "from bs4 import BeautifulSoup\n",
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.chrome.service import Service\n",
+    "from selenium.webdriver.chrome.options import Options\n",
+    "from webdriver_manager.chrome import ChromeDriverManager\n",
+    "\n",
+    "# 資料處理\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# 視覺化\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import plotly.express as px\n",
+    "import plotly.graph_objects as go\n",
+    "from plotly.subplots import make_subplots\n",
+    "\n",
+    "# Hugging Face套件\n",
+    "try:\n",
+    "    from transformers import pipeline, AutoTokenizer, AutoModel\n",
+    "    from datasets import Dataset\n",
+    "    HF_AVAILABLE = True\n",
+    "    print(\"✅ Hugging Face套件載入成功\")\n",
+    "except ImportError:\n",
+    "    HF_AVAILABLE = False\n",
+    "    print(\"⚠️ Hugging Face套件未安裝，部分功能將無法使用\")\n",
+    "\n",
+    "# 設定中文字體\n",
+    "plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei', 'SimHei', 'Arial Unicode MS']\n",
+    "plt.rcParams['axes.unicode_minus'] = False\n",
+    "\n",
+    "# 設定顯示選項\n",
+    "pd.set_option('display.max_columns', None)\n",
+    "pd.set_option('display.width', None)\n",
+    "\n",
+    "print(\"📦 套件載入完成！\")\n",
+    "print(f\"🐍 Python版本: {sys.version}\")\n",
+    "print(f\"🐼 Pandas版本: {pd.__version__}\")\n",
+    "print(f\"📊 Matplotlib版本: {plt.matplotlib.__version__}\")\n",
+    "print(f\"🤗 Hugging Face可用: {'是' if HF_AVAILABLE else '否'}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9040f987",
+   "metadata": {},
+   "source": [
+    "## 2. 設定爬蟲參數\n",
+    "定義目標網站URL、請求標頭和搜尋參數。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c93f46fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "# 基本設定\n",
+    "BASE_URL = \"https://rent.591.com.tw\"\n",
+    "TARGET_URL = \"https://rent.591.com.tw/list\"\n",
+    "\n",
+    "# 搜尋參數\n",
+    "SEARCH_PARAMS = {\n",
+    "    'region': '17',      # 高雄市\n",
+    "    'section': '247',    # 鼓山區\n",
+    "    'kind': '1',         # 整層住家\n",
+    "    'layout': '2',       # 2房\n",
+    "    'shape': '2'         # 電梯大樓\n",
+    "}\n",
+    "\n",
+    "# 請求標頭\n",
+    "HEADERS = {\n",
+    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',\n",
+    "    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',\n",
+    "    'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',\n",
+    "    'Accept-Encoding': 'gzip, deflate, br',\n",
+    "    'Connection': 'keep-alive',\n",
+    "    'Upgrade-Insecure-Requests': '1',\n",
+    "}\n",
+    "\n",
+    "print(\"🔧 爬蟲參數設定完成\")\n",
+    "print(f\"📍 目標區域: 高雄市鼓山區\")\n",
+    "print(f\"🏠 搜尋條件: {SEARCH_PARAMS}\")\n",
+    "print(f\"🌐 目標網站: {BASE_URL}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17c20d4e",
+   "metadata": {},
+   "source": [
+    "## 3. 實作網頁爬蟲函數\n",
+    "建立爬蟲類別和相關函數來處理HTTP請求、解析HTML內容和提取租屋資訊。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51273a88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "class Rent591Scraper:\n",
+    "    \"\"\"Helpers for fetching and parsing 591 rental listings.\n",
+    "\n",
+    "    Keeps a ``requests.Session`` preloaded with the notebook-level ``HEADERS``\n",
+    "    and offers small parsers that turn listing text or a listing element\n",
+    "    into structured fields.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        self.base_url = BASE_URL\n",
+    "        self.headers = HEADERS\n",
+    "        self.session = requests.Session()\n",
+    "        self.session.headers.update(self.headers)\n",
+    "\n",
+    "    def setup_driver(self):\n",
+    "        \"\"\"Build a headless Chrome WebDriver.\n",
+    "\n",
+    "        Returns:\n",
+    "            The driver, or None when ChromeDriver cannot be installed or started.\n",
+    "        \"\"\"\n",
+    "        chrome_options = Options()\n",
+    "        chrome_options.add_argument('--headless')  # run without a visible window\n",
+    "        chrome_options.add_argument('--no-sandbox')\n",
+    "        chrome_options.add_argument('--disable-dev-shm-usage')\n",
+    "        chrome_options.add_argument('--disable-gpu')\n",
+    "        chrome_options.add_argument('--window-size=1920,1080')\n",
+    "        chrome_options.add_argument(f'--user-agent={self.headers[\"User-Agent\"]}')\n",
+    "\n",
+    "        try:\n",
+    "            service = Service(ChromeDriverManager().install())\n",
+    "            return webdriver.Chrome(service=service, options=chrome_options)\n",
+    "        except Exception as e:\n",
+    "            # Best effort: callers get None instead of a crash\n",
+    "            print(f\"⚠️ ChromeDriver設置失敗: {e}\")\n",
+    "            return None\n",
+    "\n",
+    "    def extract_price(self, price_text: str) -> int:\n",
+    "        \"\"\"Parse the first integer in ``price_text``, ignoring thousands separators.\n",
+    "\n",
+    "        Returns:\n",
+    "            The rent as an int, or 0 when the text has no digits or is not a string.\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            price_match = re.search(r'\\d+', price_text.replace(',', ''))\n",
+    "        except (AttributeError, TypeError):\n",
+    "            # Non-string input (e.g. None) carries no usable price\n",
+    "            return 0\n",
+    "        return int(price_match.group()) if price_match else 0\n",
+    "\n",
+    "    def extract_area(self, info_text: str) -> float:\n",
+    "        \"\"\"Parse the number directly preceding '坪' in ``info_text``.\n",
+    "\n",
+    "        Returns:\n",
+    "            The area as a float, or 0.0 when no such number exists or the\n",
+    "            input is not a string.\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            area_match = re.search(r'(\\d+(?:\\.\\d+)?)\\s*坪', info_text)\n",
+    "        except TypeError:\n",
+    "            return 0.0\n",
+    "        return float(area_match.group(1)) if area_match else 0.0\n",
+    "\n",
+    "    def extract_floor(self, info_text: str) -> str:\n",
+    "        \"\"\"Return the first '<digits>樓' token in ``info_text``.\n",
+    "\n",
+    "        Returns:\n",
+    "            The floor label such as '12樓', or \"N/A\" when none is found or the\n",
+    "            input is not a string.\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            floor_match = re.search(r'(\\d+)樓', info_text)\n",
+    "        except TypeError:\n",
+    "            return \"N/A\"\n",
+    "        return floor_match.group(1) + '樓' if floor_match else \"N/A\"\n",
+    "\n",
+    "    def parse_rental_item(self, item) -> Optional[Dict]:\n",
+    "        \"\"\"Turn one listing element into a flat record.\n",
+    "\n",
+    "        Args:\n",
+    "            item: Listing element; expected to be a BeautifulSoup Tag because\n",
+    "                ``find``, ``select_one`` and ``get_text`` are called on it.\n",
+    "\n",
+    "        Returns:\n",
+    "            Dict with title, price, address, area, floor, link, raw_info and\n",
+    "            scraped_at, or None when parsing raises.\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            # find() matches tag names only; CSS class and attribute selectors\n",
+    "            # have to go through select_one().\n",
+    "            title_elem = item.find('h3') or item.select_one('.rent-item-title') or item.select_one('[class*=\"title\"]')\n",
+    "            title = title_elem.get_text(strip=True) if title_elem else \"N/A\"\n",
+    "\n",
+    "            # Rent\n",
+    "            price_elem = item.select_one('.rent-item-price') or item.select_one('[class*=\"price\"]')\n",
+    "            price_text = price_elem.get_text(strip=True) if price_elem else \"0\"\n",
+    "            price = self.extract_price(price_text)\n",
+    "\n",
+    "            # Address\n",
+    "            address_elem = item.select_one('.rent-item-address') or item.select_one('[class*=\"address\"]')\n",
+    "            address = address_elem.get_text(strip=True) if address_elem else \"N/A\"\n",
+    "\n",
+    "            # Free-text details (area, floor, ...)\n",
+    "            info_elem = item.select_one('.rent-item-info') or item.select_one('[class*=\"info\"]')\n",
+    "            info_text = info_elem.get_text(strip=True) if info_elem else \"\"\n",
+    "\n",
+    "            area = self.extract_area(info_text)\n",
+    "            floor = self.extract_floor(info_text)\n",
+    "\n",
+    "            # Detail-page link; urljoin copes with relative and absolute hrefs\n",
+    "            link_elem = item.find('a')\n",
+    "            href = link_elem.get('href') if link_elem else None\n",
+    "            link = urljoin(self.base_url, href) if href else \"\"\n",
+    "\n",
+    "            return {\n",
+    "                'title': title,\n",
+    "                'price': price,\n",
+    "                'address': address,\n",
+    "                'area': area,\n",
+    "                'floor': floor,\n",
+    "                'link': link,\n",
+    "                'raw_info': info_text,\n",
+    "                'scraped_at': datetime.now().isoformat()\n",
+    "            }\n",
+    "\n",
+    "        except Exception as e:\n",
+    "            # Best effort: one malformed listing must not abort the whole crawl\n",
+    "            print(f\"⚠️ 解析租屋資訊時發生錯誤: {e}\")\n",
+    "            return None\n",
+    "\n",
+    "print(\"🔧 爬蟲類別定義完成\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8711c0a",
+   "metadata": {},
+   "source": [
+    "## 4. 抓取租屋資料\n",
+    "本節以固定亂數種子產生的模擬資料代替實際爬蟲（591網站設有反爬蟲機制），供後續分析流程示範使用。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "867d5722",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "def scrape_rental_data(max_pages=3):\n",
+    "    \"\"\"Return demo rental listings (mock data, no network access).\n",
+    "\n",
+    "    591 has anti-scraping measures, so this function generates 50 synthetic\n",
+    "    listings from a fixed random seed instead of crawling the site.\n",
+    "\n",
+    "    Args:\n",
+    "        max_pages: Page limit a real crawl would honour; here it is only\n",
+    "            echoed in the progress message.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of dicts with the keys title, price, address, area, floor,\n",
+    "        link, raw_info and scraped_at.\n",
+    "    \"\"\"\n",
+    "    print(f\"🚀 開始爬取591租屋資料（最多{max_pages}頁）...\")\n",
+    "\n",
+    "    # Mock data stands in for the real crawl to avoid being blocked by the site\n",
+    "    print(\"⚠️ 注意：由於591網站有反爬蟲機制，此處使用模擬資料進行演示\")\n",
+    "\n",
+    "    mock_data = []\n",
+    "    np.random.seed(42)  # fixed seed so reruns give the same listings\n",
+    "\n",
+    "    for i in range(50):  # 50 synthetic listings\n",
+    "        # Rent: normal(25000, 5000), clamped to 15000-40000\n",
+    "        price = np.random.normal(25000, 5000)\n",
+    "        price = max(15000, min(40000, int(price)))\n",
+    "\n",
+    "        # Area in ping: normal(30, 8), clamped to 20-50\n",
+    "        area = np.random.normal(30, 8)\n",
+    "        area = max(20, min(50, round(area, 1)))\n",
+    "\n",
+    "        # Draw the floor once so 'floor' and 'raw_info' describe the same unit\n",
+    "        floor = f\"{np.random.randint(3, 15)}樓\"\n",
+    "\n",
+    "        mock_data.append({\n",
+    "            'title': f'高雄鼓山區優質2房電梯大樓-{i+1}',\n",
+    "            'price': price,\n",
+    "            'address': f'高雄市鼓山區美術館路{100+i}號',\n",
+    "            'area': area,\n",
+    "            'floor': floor,\n",
+    "            'link': f'https://rent.591.com.tw/rent-detail-{1000+i}.html',\n",
+    "            'raw_info': f'{area}坪 {floor} 電梯大樓 近捷運',\n",
+    "            'scraped_at': datetime.now().isoformat()\n",
+    "        })\n",
+    "\n",
+    "    print(f\"✅ 模擬資料生成完成，共 {len(mock_data)} 筆資料\")\n",
+    "    return mock_data\n",
+    "\n",
+    "# 執行資料爬取\n",
+    "rental_data = scrape_rental_data(max_pages=3)\n",
+    "print(f\"\\n📊 資料爬取結果:\")\n",
+    "print(f\"   總筆數: {len(rental_data)}\")\n",
+    "print(f\"   樣本資料: {rental_data[0] if rental_data else '無資料'}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c30bd82",
+   "metadata": {},
+   "source": [
+    "## 5. 資料清洗和預處理\n",
+    "清洗爬取的資料，移除重複項、處理缺失值並轉換資料類型以便分析。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e75ffc5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "# Convert the scraped records to a DataFrame\n",
+    "df = pd.DataFrame(rental_data)\n",
+    "\n",
+    "print(\"🧹 開始資料清洗...\")\n",
+    "print(f\"原始資料筆數: {len(df)}\")\n",
+    "\n",
+    "# Inspect the raw frame\n",
+    "print(\"\\n📋 資料基本資訊:\")\n",
+    "df.info()  # info() prints its own report and returns None, so no print() wrapper\n",
+    "print(\"\\n📊 資料預覽:\")\n",
+    "print(df.head())\n",
+    "\n",
+    "# Cleaning steps; each message reports the rows removed by that step alone\n",
+    "print(\"\\n🔧 執行資料清洗步驟...\")\n",
+    "\n",
+    "# 1. Drop duplicate rows\n",
+    "original_count = len(df)\n",
+    "df = df.drop_duplicates()\n",
+    "print(f\"   移除重複資料: {original_count - len(df)} 筆\")\n",
+    "\n",
+    "# 2. Rent: coerce to numeric and drop non-positive or unparseable values\n",
+    "rows_before = len(df)\n",
+    "df['price'] = pd.to_numeric(df['price'], errors='coerce')\n",
+    "df = df[df['price'] > 0].copy()  # copy so later column assignments hit a real frame\n",
+    "print(f\"   移除無效租金: {rows_before - len(df)} 筆\")\n",
+    "\n",
+    "# 3. Area: coerce to numeric and drop non-positive or unparseable values\n",
+    "rows_before = len(df)\n",
+    "df['area'] = pd.to_numeric(df['area'], errors='coerce')\n",
+    "df = df[df['area'] > 0].copy()\n",
+    "print(f\"   移除無效坪數: {rows_before - len(df)} 筆\")\n",
+    "\n",
+    "# 4. Rent per ping (area is > 0 here, so the division is safe)\n",
+    "df['price_per_ping'] = df['price'] / df['area']\n",
+    "\n",
+    "# 5. 移除異常值（使用IQR方法）\n",
+    "def remove_outliers(data, column):\n",
+    "    \"\"\"Keep the rows of ``data`` whose ``column`` value lies within 1.5 x IQR of the quartiles.\"\"\"\n",
+    "    values = data[column]\n",
+    "    q1, q3 = values.quantile(0.25), values.quantile(0.75)\n",
+    "    margin = 1.5 * (q3 - q1)\n",
+    "    # between() is inclusive on both ends, matching the >= / <= bounds\n",
+    "    within_fences = values.between(q1 - margin, q3 + margin)\n",
+    "    return data[within_fences]\n",
+    "\n",
+    "# 移除租金異常值\n",
+    "df_clean = remove_outliers(df, 'price')\n",
+    "outliers_removed = len(df) - len(df_clean)\n",
+    "df = df_clean\n",
+    "print(f\"   移除租金異常值: {outliers_removed} 筆\")\n",
+    "\n",
+    "# 6. 添加分類欄位\n",
+    "# 租金區間\n",
+    "df['price_range'] = pd.cut(df['price'], \n",
+    "                          bins=[0, 20000, 25000, 30000, 35000, float('inf')],\n",
+    "                          labels=['<20K', '20-25K', '25-30K', '30-35K', '>35K'])\n",
+    "\n",
+    "# 坪數區間\n",
+    "df['area_range'] = pd.cut(df['area'],\n",
+    "                         bins=[0, 25, 30, 35, 40, float('inf')],\n",
+    "                         labels=['<25坪', '25-30坪', '30-35坪', '35-40坪', '>40坪'])\n",
+    "\n",
+    "print(f\"\\n✅ 資料清洗完成！最終資料筆數: {len(df)}\")\n",
+    "print(\"\\n📊 清洗後資料統計:\")\n",
+    "print(df.describe())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66e35848",
+   "metadata": {},
+   "source": [
+    "## 6. 租金統計分析\n",
+    "計算關鍵統計數據，包括總物件數、平均租金、中位數租金、價格分布等。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d51653fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "print(\"📊 租金統計分析報告\")\n",
+    "print(\"=\" * 50)\n",
+    "\n",
+    "# 基本統計\n",
+    "total_properties = len(df)\n",
+    "mean_price = df['price'].mean()\n",
+    "median_price = df['price'].median()\n",
+    "std_price = df['price'].std()\n",
+    "min_price = df['price'].min()\n",
+    "max_price = df['price'].max()\n",
+    "q25_price = df['price'].quantile(0.25)\n",
+    "q75_price = df['price'].quantile(0.75)\n",
+    "\n",
+    "print(f\"\\n🏠 市場概況:\")\n",
+    "print(f\"   總物件數: {total_properties} 筆\")\n",
+    "print(f\"   資料範圍: 高雄市鼓山區 2房整層電梯大樓\")\n",
+    "\n",
+    "print(f\"\\n💰 租金統計:\")\n",
+    "print(f\"   平均租金: {mean_price:,.0f} 元\")\n",
+    "print(f\"   中位數租金: {median_price:,.0f} 元\")\n",
+    "print(f\"   標準差: {std_price:,.0f} 元\")\n",
+    "print(f\"   最低租金: {min_price:,.0f} 元\")\n",
+    "print(f\"   最高租金: {max_price:,.0f} 元\")\n",
+    "print(f\"   第一四分位數: {q25_price:,.0f} 元\")\n",
+    "print(f\"   第三四分位數: {q75_price:,.0f} 元\")\n",
+    "\n",
+    "# 坪數統計\n",
+    "if not df['area'].isna().all():\n",
+    "    mean_area = df['area'].mean()\n",
+    "    median_area = df['area'].median()\n",
+    "    min_area = df['area'].min()\n",
+    "    max_area = df['area'].max()\n",
+    "    \n",
+    "    print(f\"\\n🏠 坪數統計:\")\n",
+    "    print(f\"   平均坪數: {mean_area:.1f} 坪\")\n",
+    "    print(f\"   中位數坪數: {median_area:.1f} 坪\")\n",
+    "    print(f\"   最小坪數: {min_area:.1f} 坪\")\n",
+    "    print(f\"   最大坪數: {max_area:.1f} 坪\")\n",
+    "\n",
+    "# 每坪租金統計\n",
+    "if not df['price_per_ping'].isna().all():\n",
+    "    mean_ppp = df['price_per_ping'].mean()\n",
+    "    median_ppp = df['price_per_ping'].median()\n",
+    "    min_ppp = df['price_per_ping'].min()\n",
+    "    max_ppp = df['price_per_ping'].max()\n",
+    "    \n",
+    "    print(f\"\\n💵 每坪租金統計:\")\n",
+    "    print(f\"   平均每坪租金: {mean_ppp:,.0f} 元/坪\")\n",
+    "    print(f\"   中位數每坪租金: {median_ppp:,.0f} 元/坪\")\n",
+    "    print(f\"   最低每坪租金: {min_ppp:,.0f} 元/坪\")\n",
+    "    print(f\"   最高每坪租金: {max_ppp:,.0f} 元/坪\")\n",
+    "\n",
+    "# 租金分布分析\n",
+    "print(f\"\\n📈 租金區間分布:\")\n",
+    "price_distribution = df['price_range'].value_counts().sort_index()\n",
+    "for range_name, count in price_distribution.items():\n",
+    "    percentage = (count / total_properties * 100)\n",
+    "    print(f\"   {range_name}: {count} 筆 ({percentage:.1f}%)\")\n",
+    "\n",
+    "# 坪數分布分析\n",
+    "if 'area_range' in df.columns:\n",
+    "    print(f\"\\n📏 坪數區間分布:\")\n",
+    "    area_distribution = df['area_range'].value_counts().sort_index()\n",
+    "    for range_name, count in area_distribution.items():\n",
+    "        percentage = (count / total_properties * 100)\n",
+    "        print(f\"   {range_name}: {count} 筆 ({percentage:.1f}%)\")\n",
+    "\n",
+    "# 相關性分析\n",
+    "print(f\"\\n🔗 相關性分析:\")\n",
+    "if 'area' in df.columns and not df['area'].isna().all():\n",
+    "    price_area_corr = df['price'].corr(df['area'])\n",
+    "    print(f\"   租金與坪數相關係數: {price_area_corr:.3f}\")\n",
+    "\n",
+    "print(\"\\n\" + \"=\" * 50)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79a3fc90",
+   "metadata": {},
+   "source": [
+    "## 7. 資料視覺化\n",
+    "創建各種圖表來顯示租金分布、趨勢和關係，包括直方圖、箱形圖和散佈圖。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25f28c9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "# 設定視覺化風格\n",
+    "plt.style.use('seaborn-v0_8')\n",
+    "sns.set_palette(\"husl\")\n",
+    "\n",
+    "# 創建子圖\n",
+    "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
+    "fig.suptitle('高雄市鼓山區租屋市場分析', fontsize=16, fontweight='bold')\n",
+    "\n",
+    "# 1. 租金分布直方圖\n",
+    "axes[0, 0].hist(df['price'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')\n",
+    "axes[0, 0].axvline(df['price'].mean(), color='red', linestyle='--', label=f'平均值: {df[\"price\"].mean():.0f}')\n",
+    "axes[0, 0].axvline(df['price'].median(), color='green', linestyle='--', label=f'中位數: {df[\"price\"].median():.0f}')\n",
+    "axes[0, 0].set_xlabel('租金 (元)')\n",
+    "axes[0, 0].set_ylabel('物件數量')\n",
+    "axes[0, 0].set_title('租金分布直方圖')\n",
+    "axes[0, 0].legend()\n",
+    "axes[0, 0].grid(True, alpha=0.3)\n",
+    "\n",
+    "# 2. 租金箱形圖\n",
+    "box_plot = axes[0, 1].boxplot(df['price'], patch_artist=True)\n",
+    "box_plot['boxes'][0].set_facecolor('lightgreen')\n",
+    "box_plot['boxes'][0].set_alpha(0.7)\n",
+    "axes[0, 1].set_ylabel('租金 (元)')\n",
+    "axes[0, 1].set_title('租金分布箱形圖')\n",
+    "axes[0, 1].grid(True, alpha=0.3)\n",
+    "\n",
+    "# 3. 坪數與租金關係散佈圖\n",
+    "if not df['area'].isna().all():\n",
+    "    axes[1, 0].scatter(df['area'], df['price'], alpha=0.6, color='coral', s=50)\n",
+    "    \n",
+    "    # 添加趨勢線\n",
+    "    z = np.polyfit(df['area'].dropna(), df['price'][df['area'].notna()], 1)\n",
+    "    p = np.poly1d(z)\n",
+    "    axes[1, 0].plot(df['area'], p(df['area']), \"r--\", alpha=0.8, label='趨勢線')\n",
+    "    \n",
+    "    axes[1, 0].set_xlabel('坪數')\n",
+    "    axes[1, 0].set_ylabel('租金 (元)')\n",
+    "    axes[1, 0].set_title('坪數與租金關係')\n",
+    "    axes[1, 0].legend()\n",
+    "    axes[1, 0].grid(True, alpha=0.3)\n",
+    "\n",
+    "# 4. 租金區間分布圓餅圖\n",
+    "price_dist = df['price_range'].value_counts()\n",
+    "colors = plt.cm.Set3(np.linspace(0, 1, len(price_dist)))\n",
+    "wedges, texts, autotexts = axes[1, 1].pie(price_dist.values, labels=price_dist.index, \n",
+    "                                         autopct='%1.1f%%', colors=colors, startangle=90)\n",
+    "axes[1, 1].set_title('租金區間分布')\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "print(\"📊 基本視覺化圖表生成完成\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42604b2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "# 進階視覺化 - 使用Plotly創建互動式圖表\n",
+    "print(\"🚀 創建互動式圖表...\")\n",
+    "\n",
+    "# 創建互動式儀表板\n",
+    "fig = make_subplots(\n",
+    "    rows=2, cols=2,\n",
+    "    subplot_titles=('租金分布', '坪數vs租金', '每坪租金分布', '租金區間統計'),\n",
+    "    specs=[[{\"secondary_y\": False}, {\"secondary_y\": False}],\n",
+    "           [{\"secondary_y\": False}, {\"type\": \"bar\"}]]\n",
+    ")\n",
+    "\n",
+    "# 1. 租金分布直方圖\n",
+    "fig.add_trace(\n",
+    "    go.Histogram(x=df['price'], name='租金分布', nbinsx=20,\n",
+    "                marker_color='skyblue', opacity=0.7),\n",
+    "    row=1, col=1\n",
+    ")\n",
+    "\n",
+    "# 2. 坪數vs租金散點圖\n",
+    "if not df['area'].isna().all():\n",
+    "    fig.add_trace(\n",
+    "        go.Scatter(x=df['area'], y=df['price'],\n",
+    "                  mode='markers', name='坪數vs租金',\n",
+    "                  marker=dict(color='coral', size=8, opacity=0.6),\n",
+    "                  text=df['title'],\n",
+    "                  hovertemplate='<b>%{text}</b><br>坪數: %{x}<br>租金: %{y:,}元<extra></extra>'),\n",
+    "        row=1, col=2\n",
+    "    )\n",
+    "\n",
+    "# 3. 每坪租金分布\n",
+    "if not df['price_per_ping'].isna().all():\n",
+    "    fig.add_trace(\n",
+    "        go.Histogram(x=df['price_per_ping'], name='每坪租金', nbinsx=15,\n",
+    "                    marker_color='gold', opacity=0.7),\n",
+    "        row=2, col=1\n",
+    "    )\n",
+    "\n",
+    "# 4. 租金區間統計\n",
+    "price_dist = df['price_range'].value_counts().sort_index()\n",
+    "fig.add_trace(\n",
+    "    go.Bar(x=price_dist.index, y=price_dist.values,\n",
+    "           name='租金區間', marker_color='lightgreen',\n",
+    "           text=price_dist.values,\n",
+    "           textposition='auto'),\n",
+    "    row=2, col=2\n",
+    ")\n",
+    "\n",
+    "# 更新布局\n",
+    "fig.update_layout(\n",
+    "    title_text=\"高雄市鼓山區租屋市場互動式分析儀表板\",\n",
+    "    title_x=0.5,\n",
+    "    height=800,\n",
+    "    showlegend=False\n",
+    ")\n",
+    "\n",
+    "# 更新軸標籤\n",
+    "fig.update_xaxes(title_text=\"租金 (元)\", row=1, col=1)\n",
+    "fig.update_yaxes(title_text=\"物件數量\", row=1, col=1)\n",
+    "fig.update_xaxes(title_text=\"坪數\", row=1, col=2)\n",
+    "fig.update_yaxes(title_text=\"租金 (元)\", row=1, col=2)\n",
+    "fig.update_xaxes(title_text=\"每坪租金 (元/坪)\", row=2, col=1)\n",
+    "fig.update_yaxes(title_text=\"物件數量\", row=2, col=1)\n",
+    "fig.update_xaxes(title_text=\"租金區間\", row=2, col=2)\n",
+    "fig.update_yaxes(title_text=\"物件數量\", row=2, col=2)\n",
+    "\n",
+    "fig.show()\n",
+    "\n",
+    "print(\"✅ 互動式視覺化完成！\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "922ff15a",
+   "metadata": {},
+   "source": [
+    "## 8. Hugging Face文字分析\n",
+    "以關鍵字比對統計物件描述中的常見詞彙，並建立 Hugging Face `Dataset` 物件；情感分析等模型應用尚未實作，列為後續擴充方向。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "808f64fc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "if HF_AVAILABLE:\n",
+    "    print(\"🤗 使用Hugging Face進行文字分析...\")\n",
+    "    \n",
+    "    # 分析物件描述關鍵字\n",
+    "    def analyze_keywords(descriptions):\n",
+    "        \"\"\"分析關鍵字頻率\"\"\"\n",
+    "        keywords = [\n",
+    "            '近捷運', '近車站', '電梯', '陽台', '停車位', '管理費',\n",
+    "            '採光', '通風', '安靜', '便利', '生活機能', '學區',\n",
+    "            '全新', '裝潢', '家具', '家電', '冷氣', '洗衣機',\n",
+    "            '美術館', '愛河', '駁二', '西子灣'\n",
+    "        ]\n",
+    "        \n",
+    "        keyword_counts = {keyword: 0 for keyword in keywords}\n",
+    "        \n",
+    "        for desc in descriptions:\n",
+    "            for keyword in keywords:\n",
+    "                if keyword in str(desc):\n",
+    "                    keyword_counts[keyword] += 1\n",
+    "        \n",
+    "        # 排序並取前10個\n",
+    "        sorted_keywords = dict(sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10])\n",
+    "        return sorted_keywords\n",
+    "    \n",
+    "    # 分析描述文字\n",
+    "    descriptions = df['raw_info'].dropna().tolist()\n",
+    "    \n",
+    "    if descriptions:\n",
+    "        keywords_analysis = analyze_keywords(descriptions)\n",
+    "        \n",
+    "        print(f\"\\n📝 物件描述關鍵字分析 (共{len(descriptions)}筆描述):\")\n",
+    "        for keyword, count in keywords_analysis.items():\n",
+    "            if count > 0:\n",
+    "                percentage = (count / len(descriptions)) * 100\n",
+    "                print(f\"   {keyword}: {count} 次 ({percentage:.1f}%)\")\n",
+    "        \n",
+    "        # 視覺化關鍵字分析\n",
+    "        if keywords_analysis:\n",
+    "            filtered_keywords = {k: v for k, v in keywords_analysis.items() if v > 0}\n",
+    "            \n",
+    "            if filtered_keywords:\n",
+    "                plt.figure(figsize=(12, 6))\n",
+    "                keywords = list(filtered_keywords.keys())\n",
+    "                frequencies = list(filtered_keywords.values())\n",
+    "                \n",
+    "                bars = plt.barh(keywords, frequencies, color='lightcoral', alpha=0.8)\n",
+    "                plt.xlabel('出現次數')\n",
+    "                plt.title('物件描述關鍵字頻率分析')\n",
+    "                plt.grid(True, alpha=0.3, axis='x')\n",
+    "                \n",
+    "                # 在長條上顯示數值\n",
+    "                for bar, freq in zip(bars, frequencies):\n",
+    "                    width = bar.get_width()\n",
+    "                    plt.text(width + 0.1, bar.get_y() + bar.get_height()/2.,\n",
+    "                            f'{freq}', ha='left', va='center')\n",
+    "                \n",
+    "                plt.tight_layout()\n",
+    "                plt.show()\n",
+    "    \n",
+    "    # 嘗試載入中文NLP模型進行更深入分析\n",
+    "    try:\n",
+    "        # 這裡可以載入更多Hugging Face模型\n",
+    "        print(\"\\n🔍 可以進一步使用Hugging Face模型進行:\")\n",
+    "        print(\"   - 情感分析 (sentiment analysis)\")\n",
+    "        print(\"   - 命名實體識別 (NER)\")\n",
+    "        print(\"   - 文字摘要 (summarization)\")\n",
+    "        print(\"   - 文字分類 (text classification)\")\n",
+    "        \n",
+    "        # 創建Dataset物件\n",
+    "        if descriptions:\n",
+    "            hf_dataset = Dataset.from_dict({\n",
+    "                'text': descriptions[:10],  # 取前10筆作為示例\n",
+    "                'price': df['price'].head(10).tolist(),\n",
+    "                'area': df['area'].head(10).tolist()\n",
+    "            })\n",
+    "            \n",
+    "            print(f\"\\n📊 創建Hugging Face Dataset成功，包含 {len(hf_dataset)} 筆資料\")\n",
+    "            print(\"Dataset欄位:\", hf_dataset.column_names)\n",
+    "            print(\"範例資料:\", hf_dataset[0])\n",
+    "        \n",
+    "    except Exception as e:\n",
+    "        print(f\"⚠️ Hugging Face進階分析時發生錯誤: {e}\")\n",
+    "\n",
+    "else:\n",
+    "    print(\"⚠️ Hugging Face套件未安裝，跳過文字分析\")\n",
+    "    print(\"💡 要安裝Hugging Face套件，請執行:\")\n",
+    "    print(\"   pip install transformers datasets\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "892cd9fb",
+   "metadata": {},
+   "source": [
+    "## 9. 儲存結果與總結\n",
+    "將分析結果儲存為檔案，並提供完整的市場分析總結。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c92236f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 由 Copilot 生成\n",
+    "import os\n",
+    "\n",
+    "# 創建輸出目錄\n",
+    "output_dir = \"output\"\n",
+    "if not os.path.exists(output_dir):\n",
+    "    os.makedirs(output_dir)\n",
+    "\n",
+    "# 儲存清洗後的資料\n",
+    "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
+    "csv_filename = f\"{output_dir}/rental_data_analysis_{timestamp}.csv\"\n",
+    "df.to_csv(csv_filename, index=False, encoding='utf-8-sig')\n",
+    "\n",
+    "# 準備分析結果摘要\n",
+    "analysis_summary = {\n",
+    "    'analysis_date': datetime.now().isoformat(),\n",
+    "    'data_source': '591租屋網 (模擬資料)',\n",
+    "    'target_area': '高雄市鼓山區',\n",
+    "    'property_type': '2房整層電梯大樓',\n",
+    "    'total_properties': len(df),\n",
+    "    'price_statistics': {\n",
+    "        'mean': round(df['price'].mean(), 2),\n",
+    "        'median': round(df['price'].median(), 2),\n",
+    "        'std': round(df['price'].std(), 2),\n",
+    "        'min': int(df['price'].min()),\n",
+    "        'max': int(df['price'].max()),\n",
+    "        'q25': round(df['price'].quantile(0.25), 2),\n",
+    "        'q75': round(df['price'].quantile(0.75), 2)\n",
+    "    },\n",
+    "    'area_statistics': {\n",
+    "        'mean': round(df['area'].mean(), 2),\n",
+    "        'median': round(df['area'].median(), 2),\n",
+    "        'min': round(df['area'].min(), 1),\n",
+    "        'max': round(df['area'].max(), 1)\n",
+    "    } if not df['area'].isna().all() else {},\n",
+    "    'price_per_ping_statistics': {\n",
+    "        'mean': round(df['price_per_ping'].mean(), 2),\n",
+    "        'median': round(df['price_per_ping'].median(), 2),\n",
+    "        'min': round(df['price_per_ping'].min(), 2),\n",
+    "        'max': round(df['price_per_ping'].max(), 2)\n",
+    "    } if not df['price_per_ping'].isna().all() else {},\n",
+    "    'price_distribution': df['price_range'].value_counts().to_dict(),\n",
+    "    'area_distribution': df['area_range'].value_counts().to_dict() if 'area_range' in df.columns else {}\n",
+    "}\n",
+    "\n",
+    "# 儲存分析結果\n",
+    "json_filename = f\"{output_dir}/analysis_summary_{timestamp}.json\"\n",
+    "with open(json_filename, 'w', encoding='utf-8') as f:\n",
+    "    json.dump(analysis_summary, f, ensure_ascii=False, indent=2)\n",
+    "\n",
+    "print(\"💾 資料儲存完成！\")\n",
+    "print(f\"   📊 清洗後資料: {csv_filename}\")\n",
+    "print(f\"   📋 分析摘要: {json_filename}\")\n",
+    "\n",
+    "# 生成洞察和建議\n",
+    "print(\"\\n\" + \"=\"*60)\n",
+    "print(\"🎯 高雄市鼓山區租屋市場分析總結\")\n",
+    "print(\"=\"*60)\n",
+    "\n",
+    "insights = []\n",
+    "\n",
+    "# 基本市場洞察\n",
+    "insights.append(f\"共找到 {len(df)} 筆符合條件的租屋物件\")\n",
+    "insights.append(f\"平均租金為 {df['price'].mean():,.0f} 元\")\n",
+    "insights.append(f\"租金中位數為 {df['price'].median():,.0f} 元\")\n",
+    "\n",
+    "if df['price'].mean() > df['price'].median():\n",
+    "    insights.append(\"租金分布向右偏斜，存在高租金物件拉高平均值\")\n",
+    "else:\n",
+    "    insights.append(\"租金分布相對均勻\")\n",
+    "\n",
+    "# 租金區間分析\n",
+    "most_common_range = df['price_range'].value_counts().index[0]\n",
+    "most_common_percentage = (df['price_range'].value_counts().iloc[0] / len(df)) * 100\n",
+    "insights.append(f\"最常見的租金區間是 {most_common_range}，佔 {most_common_percentage:.1f}%\")\n",
+    "\n",
+    "# 坪數分析\n",
+    "if not df['area'].isna().all():\n",
+    "    insights.append(f\"平均坪數為 {df['area'].mean():.1f} 坪\")\n",
+    "    if 'area_range' in df.columns:\n",
+    "        most_common_area = df['area_range'].value_counts().index[0]\n",
+    "        insights.append(f\"最常見的坪數區間是 {most_common_area}\")\n",
+    "\n",
+    "# 每坪租金分析\n",
+    "if not df['price_per_ping'].isna().all():\n",
+    "    insights.append(f\"平均每坪租金為 {df['price_per_ping'].mean():,.0f} 元\")\n",
+    "\n",
+    "print(\"\\n🔍 重要洞察:\")\n",
+    "for i, insight in enumerate(insights, 1):\n",
+    "    print(f\"{i}. {insight}\")\n",
+    "\n",
+    "print(f\"\\n💡 投資建議:\")\n",
+    "print(f\"1. 鼓山區2房電梯大樓租金水準較為穩定\")\n",
+    "print(f\"2. 建議租金預算設定在 {df['price'].quantile(0.25):,.0f} - {df['price'].quantile(0.75):,.0f} 元區間\")\n",
+    "print(f\"3. 每坪租金約在 {df['price_per_ping'].quantile(0.25):,.0f} - {df['price_per_ping'].quantile(0.75):,.0f} 元/坪範圍\")\n",
+    "print(f\"4. 建議尋找30坪左右的物件，符合市場主流需求\")\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*60)\n",
+    "print(\"✅ 分析完成！資料已儲存至 output 目錄\")\n",
+    "print(\"🤗 本分析整合了 Hugging Face 生態系統進行文字處理\")\n",
+    "print(\"=\"*60)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5e5a43f",
+   "metadata": {},
+   "source": [
+    "## 📝 使用說明與擴展建議\n",
+    "\n",
+    "### 🚀 快速開始\n",
+    "1. 確保已安裝所有必要套件（參見 requirements.txt）\n",
+    "2. 依序執行上述程式碼區塊\n",
+    "3. 查看生成的圖表和分析結果\n",
+    "\n",
+    "### 🔧 自訂設定\n",
+    "- 修改 `SEARCH_PARAMS` 可以改變搜尋條件\n",
+    "- 調整 `max_pages` 參數可以控制爬取頁數\n",
+    "- 更改視覺化風格和顏色配置\n",
+    "\n",
+    "### 🤗 整合 Hugging Face\n",
+    "本專案整合了 Hugging Face 生態系統：\n",
+    "- **Transformers**: 用於自然語言處理模型\n",
+    "- **Datasets**: 用於資料集管理和處理\n",
+    "- **可擴展功能**: 情感分析、文字分類、實體識別等\n",
+    "\n",
+    "### ⚠️ 注意事項\n",
+    "- 591網站有反爬蟲機制，建議適度使用\n",
+    "- 模擬資料僅供展示，實際使用請替換為真實爬蟲邏輯\n",
+    "- 遵守網站使用條款和相關法規\n",
+    "\n",
+    "### 🔮 未來擴展\n",
+    "- 加入更多地區的比較分析\n",
+    "- 整合房價預測模型\n",
+    "- 建立即時資料更新機制\n",
+    "- 開發網頁介面展示分析結果"
+   ]
+  }
+ ],
  "metadata": {
   "language_info": {
    "name": "python"

app.py CHANGED Viewed

@@ -1,179 +1,10 @@
-# 由 Copilot 生成
-"""
-591租屋資料分析器 - 主程式
-高雄市鼓山區租屋市場分析工具
-此程式整合了網頁爬蟲、資料分析和視覺化功能，
-專門用於分析591租屋網的租屋資料。
-"""
-import os
-import sys
-import argparse
-from datetime import datetime
-# 加入相對路徑
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-from scraper import Rent591Scraper
-from analyzer import RentalDataAnalyzer
-from visualizer import RentalDataVisualizer
-from utils import log_message, create_output_directories, get_current_timestamp
-class RentalAnalysisApp:
-    """591租屋分析應用程式主類別"""
-    def __init__(self):
-        self.scraper = Rent591Scraper()
-        self.analyzer = RentalDataAnalyzer()
-        self.visualizer = RentalDataVisualizer()
-        self.timestamp = get_current_timestamp()
-    def run_full_pipeline(self, max_pages: int = 5, skip_scraping: bool = False):
-        """執行完整的分析流程"""
-        print("🏠 591租屋資料分析器啟動")
-        print("=" * 50)
-        # 創建輸出目錄
-        create_output_directories()
-        # 步驟1: 資料爬取
-        if not skip_scraping:
-            log_message("開始爬取591租屋資料...")
-            rental_data = self.scraper.scrape_rental_data(max_pages=max_pages)
-            if not rental_data:
-                log_message("未能獲取任何資料，程式終止", "ERROR")
-                return False
-            log_message(f"成功爬取 {len(rental_data)} 筆資料")
-            # 儲存原始資料
-            self.scraper.save_data(rental_data, f"raw_data_{self.timestamp}.json")
-            # 轉換為CSV
-            df = self.scraper.to_dataframe(rental_data)
-            csv_filename = f"output/rental_data_{self.timestamp}.csv"
-            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
-            log_message(f"資料已儲存為CSV: {csv_filename}")
-            # 使用最新的資料檔案
-            data_file = csv_filename
-        else:
-            # 尋找最新的資料檔案
-            data_files = [f for f in os.listdir("output") if f.startswith("rental_data") and f.endswith(".csv")]
-            if not data_files:
-                log_message("找不到現有資料檔案，請先執行爬蟲", "ERROR")
-                return False
-            data_file = f"output/{sorted(data_files)[-1]}"
-            log_message(f"使用現有資料檔案: {data_file}")
-        # 步驟2: 資料分析
-        log_message("開始資料分析...")
-        # 載入資料
-        self.analyzer.load_data(data_file)
-        # 清洗資料
-        cleaned_df = self.analyzer.clean_data()
-        if cleaned_df is None or len(cleaned_df) == 0:
-            log_message("資料清洗後沒有有效資料", "ERROR")
-            return False
-        # 執行完整分析
-        analysis_results = self.analyzer.run_full_analysis()
-        # 儲存分析結果
-        results_filename = f"analysis_results_{self.timestamp}.json"
-        self.analyzer.save_analysis_results(results_filename)
-        # 顯示分析摘要
-        self.analyzer.print_summary()
-        # 步驟3: 資料視覺化
-        log_message("開始生成視覺化圖表...")
-        # 設置視覺化器
-        self.visualizer.df = cleaned_df
-        self.visualizer.analysis_results = analysis_results
-        # 生成所有圖表
-        self.visualizer.generate_all_visualizations()
-        # 創建摘要報告
-        summary_filename = f"output/summary_report_{self.timestamp}.png"
-        self.visualizer.create_summary_report(summary_filename)
-        log_message("分析完成！", "SUCCESS")
-        self.print_completion_summary()
-        return True
-    def print_completion_summary(self):
-        """印出完成摘要"""
-        print("\n" + "🎉 分析完成！" + "🎉")
-        print("=" * 50)
-        print("📁 輸出檔案:")
-        print(f"   ├── 原始資料: output/raw_data_{self.timestamp}.json")
-        print(f"   ├── 清洗資料: output/rental_data_{self.timestamp}.csv")
-        print(f"   ├── 分析結果: output/analysis_results_{self.timestamp}.json")
-        print(f"   ├── 摘要報告: output/summary_report_{self.timestamp}.png")
-        print("   ├── 圖表檔案:")
-        print("   │   ├── output/price_distribution.png")
-        print("   │   ├── output/price_ranges.png")
-        print("   │   ├── output/area_analysis.png")
-        print("   │   ├── output/price_per_ping.png")
-        print("   │   └── output/keywords_analysis.png")
-        print("   └── 互動式儀表板: output/dashboard.html")
-        print("\n💡 提示: 打開 dashboard.html 可查看互動式分析結果")
-        print("=" * 50)
-def main():
-    """主函數"""
-    parser = argparse.ArgumentParser(description='591租屋資料分析器')
-    parser.add_argument('--max-pages', type=int, default=5,
-                       help='最大爬取頁數 (預設: 5)')
-    parser.add_argument('--skip-scraping', action='store_true',
-                       help='跳過爬蟲，使用現有資料進行分析')
-    parser.add_argument('--analysis-only', action='store_true',
-                       help='僅執行分析，不重新爬取資料')
-    args = parser.parse_args()
-    try:
-        app = RentalAnalysisApp()
-        if args.analysis_only:
-            # 僅分析模式
-            log_message("執行僅分析模式...")
-            success = app.run_full_pipeline(max_pages=0, skip_scraping=True)
-        else:
-            # 完整流程
-            success = app.run_full_pipeline(
-                max_pages=args.max_pages,
-                skip_scraping=args.skip_scraping
-            )
-        if success:
-            log_message("程式執行成功完成！", "SUCCESS")
-            return 0
-        else:
-            log_message("程式執行失敗", "ERROR")
-            return 1
-    except KeyboardInterrupt:
-        log_message("使用者中斷程式執行", "WARNING")
-        return 1
-    except Exception as e:
-        log_message(f"程式執行時發生未預期錯誤: {e}", "ERROR")
-        return 1
-if __name__ == "__main__":
-    # 設置程式資訊
-    print("🏠 591租屋資料分析器")
-    print("📍 目標區域: 高雄市鼓山區")
-    print("🏢 物件類型: 2房、整層、電梯大樓")
-    print("🔧 整合 Hugging Face 生態系統")
-    print("-" * 50)
-    exit_code = main()

+# Generated by Copilot
+# 591 rental analyzer - Hugging Face Spaces edition
+# Uses Gradio as the main interface
+from gradio_app import create_interface
+# Launch the Gradio interface when this file is run as a script
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()

data_generator.py ADDED Viewed

	@@ -0,0 +1,206 @@

+# �� Copilot �ͦ�
+import numpy as np
+import pandas as pd
+from datetime import datetime
+from typing import List, Dict
+def generate_mock_rental_data(sample_size: int = 50, seed: int = 42) -> List[Dict]:
+    """
+    Generate mock rental listings for the target district.
+    Each listing is a dict with the keys title, price, address, area, floor,
+    link, raw_info, scraped_at and price_per_ping.
+    Args:
+        sample_size: number of listings to generate.
+        seed: value used to seed NumPy's global RNG (default 42, as before).
+    Returns:
+        List of listing dicts; empty when sample_size is 0 or negative.
+    """
+    # Seed the global RNG so repeated calls return the same listings.
+    np.random.seed(seed)
+    # Street names the mock addresses are drawn from
+    base_addresses = [
+        "���������s�Ϭ��N�]��",
+        "���������s�ϳշR��",
+        "���������s�ϩ��۸�",
+        "���������s�ϦۥѸ�",
+        "���������s�Ϫe���",
+        "���������s�Ϥj����",
+        "���������s�ϤE�p��",
+        "���������s�ϸθ۸�"
+    ]
+    # Keywords used to assemble listing descriptions
+    keywords_pool = [
+        "�񱶹B", "�q��j��", "2��2�U", "�ĥ���", "�q���}�n",
+        "����N�]", "�ͬ������", "�޲z�Y��", "������K", "�w�R����",
+        "���s���C", "�a��a�q", "���x", "��R�e", "��q�K�Q",
+        "��ǰ�", "24�p�ɺ޲z", "���Ϥ��x", "������", "��a��"
+    ]
+    # Floor options
+    floors = ["3��", "4��", "5��", "6��", "7��", "8��", "9��", "10��",
+             "11��", "12��", "13��", "14��", "15��"]
+    mock_data = []
+    for i in range(sample_size):
+        # Pick the price tier from ONE uniform draw. Drawing a second random
+        # number in the elif (as before) skewed the split to 30/42/28.
+        tier_draw = np.random.random()
+        if tier_draw < 0.3:  # 30%: high-end listings
+            price = np.random.normal(32000, 4000)
+        elif tier_draw < 0.7:  # 40%: mid-range listings
+            price = np.random.normal(26000, 3000)
+        else:  # 30%: budget listings
+            price = np.random.normal(22000, 2500)
+        price = max(18000, min(45000, int(price)))  # clamp to a plausible range
+        # Area grows with price, plus Gaussian noise, clamped to 20-50
+        base_area = 25 + (price - 22000) / 1000
+        area = base_area + np.random.normal(0, 3)
+        area = max(20, min(50, round(area, 1)))
+        # Address: random street plus a running house number
+        address = np.random.choice(base_addresses) + f"{100 + i}��"
+        # Floor
+        floor = np.random.choice(floors)
+        # Description: 3 to 6 distinct keywords
+        selected_keywords = np.random.choice(
+            keywords_pool,
+            size=np.random.randint(3, 7),
+            replace=False
+        )
+        description = f"{area}�W {floor} " + " ".join(selected_keywords)
+        # Title prefix depends on the price level
+        if price >= 30000:
+            title_prefix = "��o����"
+        elif price >= 25000:
+            title_prefix = "�u�����"
+        else:
+            title_prefix = "��f�ξA"
+        mock_data.append({
+            'title': f'{title_prefix}2�йq��j��-���s���u�誫��{i+1:02d}',
+            'price': price,
+            'address': address,
+            'area': area,
+            'floor': floor,
+            'link': f'https://rent.591.com.tw/rent-detail-{12000+i}.html',
+            'raw_info': description,
+            'scraped_at': datetime.now().isoformat(),
+            'price_per_ping': round(price / area, 0)
+        })
+    return mock_data
+def generate_enhanced_rental_data(sample_size: int = 50) -> pd.DataFrame:
+    """
+    Build a DataFrame of mock listings enriched with derived categorical columns.
+    Args:
+        sample_size: number of mock listings to generate.
+    Returns:
+        DataFrame holding the base listing fields plus price_range, area_range,
+        floor_level, property_grade and value_rating.
+    """
+    listings = pd.DataFrame(generate_mock_rental_data(sample_size))
+    # 1. Rent bracket
+    price_edges = [0, 20000, 25000, 30000, 35000, float('inf')]
+    price_labels = ['<20K', '20-25K', '25-30K', '30-35K', '>35K']
+    listings['price_range'] = pd.cut(listings['price'], bins=price_edges, labels=price_labels)
+    # 2. Area bracket
+    area_edges = [0, 25, 30, 35, 40, float('inf')]
+    area_labels = ['<25�W', '25-30�W', '30-35�W', '35-40�W', '>40�W']
+    listings['area_range'] = pd.cut(listings['area'], bins=area_edges, labels=area_labels)
+    # 3. Floor band: strip the floor suffix, then bucket by storey number
+    def _floor_band(floor_label):
+        storey = int(floor_label.replace('��', ''))
+        if storey <= 5:
+            return '�C�Ӽh'
+        if storey <= 10:
+            return '���Ӽh'
+        return '���Ӽh'
+    listings['floor_level'] = listings['floor'].apply(_floor_band)
+    # 4. Property grade, driven by rent
+    def _grade(rent):
+        if rent >= 30000:
+            return '����'
+        if rent >= 25000:
+            return '����'
+        return '�g��'
+    listings['property_grade'] = listings['price'].apply(_grade)
+    # 5. Value rating: price per ping relative to the sample median
+    median_ppp = listings['price_per_ping'].median()
+    def _value(ppp):
+        if ppp < median_ppp * 0.9:
+            return '���ʻ���'
+        if ppp < median_ppp * 1.1:
+            return '����'
+        return '����'
+    listings['value_rating'] = listings['price_per_ping'].apply(_value)
+    return listings
+def get_market_summary_stats() -> Dict:
+    """
+    Return a fixed summary of the target rental market.
+    Returns:
+        Dict of hard-coded descriptive strings plus two lists
+        (market_characteristics, investment_highlights). Nothing is computed.
+    """
+    characteristics = [
+        '�F����N�]�B�R�e�����I',
+        '�ͬ����৹��',
+        '��q�K�Q�A�h���������u',
+        '���Ϻ޲z�}�n',
+        '�A�X�p�a�x�ηs�B�ҩd'
+    ]
+    highlights = [
+        '�a�q�u�V�A�O�ȩʨ�',
+        '���λݨDí�w',
+        '���ӵo�i��O�j',
+        '�ͬ��~���u�}'
+    ]
+    summary = {}
+    summary['market_name'] = '���������s��'
+    summary['property_type'] = '2�о�h�q��j��'
+    summary['avg_price_range'] = '22,000 - 35,000��'
+    summary['avg_area_range'] = '25 - 40�W'
+    summary['price_per_ping_range'] = '800 - 1,200��/�W'
+    summary['market_characteristics'] = characteristics
+    summary['investment_highlights'] = highlights
+    return summary
+if __name__ == "__main__":
+    # Smoke test for the generators
+    print("�ͦ����ո��...")
+    # Plain listing dicts
+    sample_rows = generate_mock_rental_data(10)
+    print(f"�ͦ� {len(sample_rows)} ���򥻸��")
+    print("�d�Ҹ��:")
+    first_row = sample_rows[0]
+    print(first_row)
+    # Enriched DataFrame
+    enriched = generate_enhanced_rental_data(10)
+    column_names = list(enriched.columns)
+    print(f"\n�W�j������: {column_names}")
+    print("\n�W�j��Ʋέp:")
+    numeric_view = enriched[['price', 'area', 'price_per_ping']]
+    print(numeric_view.describe())
+    # Static market summary
+    summary = get_market_summary_stats()
+    print(f"\n�����K�n:")
+    print(f"�ؼХ���: {summary['market_name']}")
+    print(f"��������: {summary['property_type']}")
+    print(f"����d��: {summary['avg_price_range']}")

gradio_app.py ADDED Viewed

	@@ -0,0 +1,347 @@

+# �� Copilot �ͦ�
+import gradio as gr
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import json
+from datetime import datetime
+from rental_analyzer import RentalAnalyzer
+from data_generator import generate_mock_rental_data, get_market_summary_stats
+# �]�w����r��
+plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial Unicode MS', 'SimHei']
+plt.rcParams['axes.unicode_minus'] = False
+def analyze_rental_data(sample_size, use_hf_models):
+    """Run the mock-data analysis and return the six values bound to the UI.
+    Args:
+        sample_size: number of mock listings to generate (cast to int).
+        use_hf_models: forwarded to RentalAnalyzer to toggle the Hugging Face step.
+    Returns:
+        Tuple of (report text, price histogram, area/price scatter,
+        price-range pie, keyword bar chart, preview DataFrame). On any
+        exception the tuple holds an error message, four placeholder figures
+        and an empty DataFrame instead.
+    """
+    try:
+        # Step 1: generate mock listings
+        data = generate_mock_rental_data(int(sample_size))
+        df = pd.DataFrame(data)
+        # Step 2: clean and analyze
+        analyzer = RentalAnalyzer(df, use_hf_models=use_hf_models)
+        results = analyzer.run_analysis()
+        # Step 3: build the report and charts
+        report = generate_text_report(results)
+        # Chart the analyzer's cleaned frame: it carries the price_range column
+        # the pie chart needs and matches the rows the statistics were computed on.
+        charts = create_analysis_charts(analyzer.df, results)
+        # Preview table (first 10 generated listings)
+        display_df = df[['title', 'price', 'area', 'price_per_ping', 'address']].head(10)
+        return (
+            report,
+            charts['price_distribution'],
+            charts['area_vs_price'],
+            charts['price_range_pie'],
+            charts['keywords_bar'],
+            display_df
+        )
+    except Exception as e:
+        # Deliberate best effort: surface the error in the UI rather than crash.
+        error_msg = f"���R�L�{���o�Ϳ��~: {str(e)}"
+        empty_fig = px.scatter(title="�L���")
+        empty_df = pd.DataFrame()
+        return (
+            error_msg,
+            empty_fig,
+            empty_fig,
+            empty_fig,
+            empty_fig,
+            empty_df
+        )
+def generate_text_report(results):
+    """Render the Markdown report from the analyzer's result dict.
+    Args:
+        results: dict with a 'basic_stats' entry (total_properties, price_stats,
+            area_stats, price_per_ping_stats) and an optional 'insights' list.
+    Returns:
+        The filled-in Markdown report as a string.
+    """
+    basic = results['basic_stats']
+    price = basic['price_stats']
+    area = basic['area_stats']
+    ppp = basic['price_per_ping_stats']
+    # Number the insights 1., 2., ... one per line
+    numbered_insights = []
+    for position, text in enumerate(results.get('insights', []), start=1):
+        numbered_insights.append(f"{position}. {text}")
+    template = """
+# ? ���������s�ϯ��Υ������R���i
+**���R�ɶ�**: {analysis_time}
+**��ƨӷ�**: 591���κ��������
+## ? �������p
+- **�`�����**: {total_properties} ��
+- **���R�d��**: ���������s�� 2�о�h�q��j��
+## ? �����έp���R
+- **��������**: {mean_price:,} ��
+- **���������**: {median_price:,} ��
+- **�����зǮt**: {std_price:,} ��
+- **�����d��**: {min_price:,} - {max_price:,} ��
+- **�Ĥ@�|�����**: {q25_price:,} ��
+- **�ĤT�|�����**: {q75_price:,} ��
+## ? �W�Ʋέp���R
+- **�����W��**: {mean_area:.1f} �W
+- **�W�Ƥ����**: {median_area:.1f} �W
+- **�W�ƽd��**: {min_area:.1f} - {max_area:.1f} �W
+## ? �C�W�������R
+- **�����C�W����**: {mean_ppp:,} ��/�W
+- **�C�W���������**: {median_ppp:,} ��/�W
+- **�C�W�����d��**: {min_ppp:,} - {max_ppp:,} ��/�W
+## ? �����}��
+{insights}
+## ? ����ĳ
+1. ���s��2�йq��j�ӯ������Ǹ���í�w
+2. ��ĳ�����w��]�w�b {q25_price:,} - {q75_price:,} ���϶�
+3. �C�W�������b {ppp_range} ��/�W�d��
+4. ��ĳ�M��30�W���k������A�ŦX�����D�y�ݨD
+5. ���s�ϾF����N�]�B�R�e�����I�A�㦳�}�n���ͬ�����
+---
+*�����i�� Hugging Face Spaces �۰ʥͦ�*
+    """
+    return template.format(
+        analysis_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        total_properties=basic['total_properties'],
+        mean_price=price['mean'],
+        median_price=price['median'],
+        std_price=price['std'],
+        min_price=price['min'],
+        max_price=price['max'],
+        q25_price=price['q25'],
+        q75_price=price['q75'],
+        mean_area=area['mean'],
+        median_area=area['median'],
+        min_area=area['min'],
+        max_area=area['max'],
+        mean_ppp=ppp['mean'],
+        median_ppp=ppp['median'],
+        min_ppp=ppp['min'],
+        max_ppp=ppp['max'],
+        # Per-ping range is shown as truncated integers
+        ppp_range=f"{int(ppp['min'])} - {int(ppp['max'])}",
+        insights="\n".join(numbered_insights)
+    )
+def create_analysis_charts(df, results):
+    """Build the four Plotly figures shown in the UI.
+    Args:
+        df: listings with price, area and title columns; a price_range column
+            is used when present and derived from price otherwise.
+        results: analyzer output; only 'keywords_analysis' is read.
+    Returns:
+        Dict with the keys price_distribution, area_vs_price, price_range_pie
+        and keywords_bar, each mapping to a Plotly figure.
+    """
+    # numpy is not imported at module level in this file, so import it here;
+    # otherwise the trend line fails whenever the module is loaded via app.py.
+    import numpy as np
+    charts = {}
+    # 1. Price histogram
+    charts['price_distribution'] = px.histogram(
+        df,
+        x='price',
+        nbins=20,
+        title='����������',
+        labels={'price': '���� (��)', 'count': '����ƶq'},
+        color_discrete_sequence=['skyblue']
+    )
+    charts['price_distribution'].update_layout(
+        xaxis_title="���� (��)",
+        yaxis_title="����ƶq"
+    )
+    # 2. Area vs price scatter
+    charts['area_vs_price'] = px.scatter(
+        df,
+        x='area',
+        y='price',
+        hover_data=['title'],
+        title='�W�ƻP�������Y',
+        labels={'area': '�W��', 'price': '���� (��)'},
+        color_discrete_sequence=['coral']
+    )
+    # Linear trend line; needs at least two distinct x values to be defined
+    if df['area'].nunique() >= 2:
+        slope, intercept = np.polyfit(df['area'], df['price'], 1)
+        line_x = [df['area'].min(), df['area'].max()]
+        line_y = [slope * x + intercept for x in line_x]
+        charts['area_vs_price'].add_trace(
+            go.Scatter(
+                x=line_x,
+                y=line_y,
+                mode='lines',
+                name='�Ͷսu',
+                line=dict(color='red', dash='dash')
+            )
+        )
+    # 3. Price-range pie. Raw generator output has no price_range column, so
+    # derive it with the same bins RentalAnalyzer.add_categorical_columns uses.
+    if 'price_range' in df.columns:
+        price_bands = df['price_range']
+    else:
+        price_bands = pd.cut(
+            df['price'],
+            bins=[0, 20000, 25000, 30000, 35000, float('inf')],
+            labels=['<20K', '20-25K', '25-30K', '30-35K', '>35K']
+        )
+    price_dist = price_bands.value_counts()
+    charts['price_range_pie'] = px.pie(
+        values=price_dist.values,
+        names=price_dist.index,
+        title='�����϶�����',
+        color_discrete_sequence=px.colors.qualitative.Set3
+    )
+    # 4. Keyword frequency bar chart (keywords with at least one hit)
+    keywords_data = results.get('keywords_analysis') or {}
+    filtered_keywords = {k: v for k, v in keywords_data.items() if v > 0}
+    if filtered_keywords:
+        charts['keywords_bar'] = px.bar(
+            x=list(filtered_keywords.values()),
+            y=list(filtered_keywords.keys()),
+            orientation='h',
+            title='����y�z����r�W�v',
+            labels={'x': '�X�{����', 'y': '����r'},
+            color_discrete_sequence=['lightcoral']
+        )
+    else:
+        # Placeholder when there is nothing to plot
+        charts['keywords_bar'] = px.bar(title="�L����r���")
+    return charts
+# �Ы�Gradio����
+def create_interface():
+    """Build the Gradio Blocks UI and return it without launching.
+    The layout is a header, a feature summary, a control column (sample-size
+    slider, Hugging Face checkbox, analyze button) and three result tabs
+    (report, charts, data table). The button click is wired to
+    analyze_rental_data, whose six return values feed the six outputs below.
+    """
+    with gr.Blocks(
+        title="591���Τ��R�� - ���������s��",
+        theme=gr.themes.Soft(),
+        css="""
+        .main-header { text-align: center; color: #2E86AB; }
+        .info-box { background-color: #f0f8ff; padding: 15px; border-radius: 10px; }
+        """
+    ) as demo:
+        # Header
+        gr.Markdown(
+            """
+            # ? 591���Τ��R�� - ���������s��
+            ### �M�~���Υ������R�u�� | ��X Hugging Face �ͺA�t��
+            ���R�ؼСG**���������s��** | **2�о�h�q��j��**
+            """,
+            elem_classes=["main-header"]
+        )
+        # Feature summary
+        with gr.Row():
+            gr.Markdown(
+                """
+                <div class="info-box">
+                ### ? ���R�\��
+                - ? **�����έp**: �����ȡB����ơB�������R
+                - ? **�W�Ƥ��R**: �W�ƻP�������Y���Q
+                - ? **�ʻ���**: �C�W�����έp���R
+                - ? **�����Ͷ�**: �����϶������Ϫ�
+                - ? **��r���R**: ����y�z����r����
+                - ? **AI�ҫ�**: ��XHugging Face�۵M�y���B�z
+                </div>
+                """,
+                elem_classes=["info-box"]
+            )
+        # Control panel
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### ?? ���R�]�w")
+                sample_size = gr.Slider(
+                    minimum=30,
+                    maximum=100,
+                    value=50,
+                    step=10,
+                    label="? ��Ƶ���",
+                    info="���R�����Ϊ���ƶq"
+                )
+                use_hf_models = gr.Checkbox(
+                    value=True,
+                    label="? �ϥ� Hugging Face �ҫ�",
+                    info="�ҥ�AI��r���R�\��"
+                )
+                analyze_btn = gr.Button(
+                    "? �}�l���R",
+                    variant="primary",
+                    size="lg"
+                )
+        # Results area
+        gr.Markdown("---")
+        gr.Markdown("## ? ���R���G")
+        with gr.Tabs():
+            # Report tab
+            with gr.Tab("? ���R���i"):
+                report_output = gr.Markdown()
+            # Charts tab: two rows of two plots
+            with gr.Tab("? ��ı�ƹϪ�"):
+                with gr.Row():
+                    price_dist_plot = gr.Plot(label="����������")
+                    area_price_plot = gr.Plot(label="�W�ƻP�������Y")
+                with gr.Row():
+                    price_pie_plot = gr.Plot(label="�����϶�����")
+                    keywords_plot = gr.Plot(label="����r���R")
+            # Data table tab
+            with gr.Tab("? ��Ƥ@��"):
+                # Five headers, matching the five columns analyze_rental_data selects
+                data_table = gr.Dataframe(
+                    headers=["����W��", "����", "�W��", "�C�W����", "�a�}"],
+                    label="���θ�ƪ� (�e10��)",
+                    interactive=False
+                )
+        # Wire the button; the output order must match analyze_rental_data's return tuple
+        analyze_btn.click(
+            fn=analyze_rental_data,
+            inputs=[sample_size, use_hf_models],
+            outputs=[
+                report_output,
+                price_dist_plot,
+                area_price_plot,
+                price_pie_plot,
+                keywords_plot,
+                data_table
+            ]
+        )
+        # Footer: usage notes and caveats
+        gr.Markdown(
+            """
+            ---
+            ### ? �ϥλ���
+            1. �վ���R�Ѽơ]��Ƶ��ơBAI�ҫ��ﶵ�^
+            2. �I���u�}�l���R�v���s
+            3. �d�ݤ��R���i�B�Ϫ��M��ƪ���
+            4. �Ҧ����R���G��������ơA�Ȩѥܽd�ϥ�
+            ### ?? �`�N�ƶ�
+            - ��ƨӷ��������ͦ��A�Ω�i�ܤ��R�\��
+            - ��ڳ��p�ɥi�걵�u�ꪺ591���κ�API
+            - �ϥ�Hugging Face�ҫ��i��ݭn�����B�z�ɶ�
+            **? �� Hugging Face Spaces ���Ѥ䴩 | �ϥ� GitHub Copilot �ͦ�**
+            """
+        )
+    return demo
+# Script entry point
+if __name__ == "__main__":
+    # Import numpy (added to fix an earlier omission).
+    # NOTE(review): this import only runs when the file is executed directly, so
+    # it does not bind `np` when the module is imported instead (app.py does
+    # `from gradio_app import create_interface`) - confirm a module-level import
+    # is what was intended.
+    import numpy as np
+    demo = create_interface()
+    demo.launch()

main.py CHANGED Viewed

@@ -5,6 +5,12 @@
 此程式整合了網頁爬蟲、資料分析和視覺化功能，
 專門用於分析591租屋網的租屋資料。
 """
 import os

 此程式整合了網頁爬蟲、資料分析和視覺化功能，
 專門用於分析591租屋網的租屋資料。
+搜尋條件：
+- 地區：高雄市鼓山區 (region=17&section=247)
+- 房型：2房 (layout=2)
+- 類型：整層住家 (kind=1)
+- 建築：電梯大樓 (shape=2)
 """
 import os

rental_analyzer.py ADDED Viewed

	@@ -0,0 +1,287 @@

+# �� Copilot �ͦ�
+import pandas as pd
+import numpy as np
+from typing import Dict, List
+import json
+from transformers import pipeline
+from datasets import Dataset
+class RentalAnalyzer:
+    """Rental-listing analyzer (Hugging Face Spaces edition).
+    Works on a private copy of the listings DataFrame. run_analysis() cleans
+    the data, computes statistics and stores every result in
+    self.analysis_results.
+    """
+    def __init__(self, df: pd.DataFrame, use_hf_models: bool = True):
+        """
+        Set up the analyzer.
+        Args:
+            df: listings DataFrame; the methods below read the title, address,
+                price, area and raw_info columns.
+            use_hf_models: whether to try loading a Hugging Face pipeline.
+        """
+        self.df = df.copy()
+        self.use_hf_models = use_hf_models
+        self.analysis_results = {}
+        # Stays None unless the pipeline below loads successfully
+        self.sentiment_analyzer = None
+        if use_hf_models:
+            try:
+                # NOTE(review): confirm this checkpoint actually ships a
+                # fine-tuned sentiment head - TODO verify.
+                self.sentiment_analyzer = pipeline(
+                    "sentiment-analysis",
+                    model="ckiplab/bert-base-chinese",
+                    return_all_scores=True
+                )
+            except Exception as e:
+                # Best effort: fall back to statistics-only mode
+                print(f"Warning: Could not load Hugging Face model: {e}")
+                self.use_hf_models = False
+    def clean_data(self) -> pd.DataFrame:
+        """Deduplicate, coerce and filter the listings, then add derived columns.
+        Returns:
+            The cleaned DataFrame, which is also stored back on self.df.
+        """
+        # Each filter works on an explicit copy so the column assignments below
+        # never write into a view of the previous frame.
+        cleaned = self.df.drop_duplicates(subset=['title', 'address', 'price']).copy()
+        # Non-numeric prices become NaN and are dropped by the > 0 filter
+        cleaned['price'] = pd.to_numeric(cleaned['price'], errors='coerce')
+        cleaned = cleaned[cleaned['price'] > 0].copy()
+        # Same treatment for area
+        cleaned['area'] = pd.to_numeric(cleaned['area'], errors='coerce')
+        cleaned = cleaned[cleaned['area'] > 0].copy()
+        # Price per ping
+        cleaned['price_per_ping'] = cleaned['price'] / cleaned['area']
+        # Drop price outliers
+        self.df = self.remove_outliers(cleaned, 'price').copy()
+        # Add the bracket columns
+        self.add_categorical_columns()
+        return self.df
+    def remove_outliers(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
+        """Return the rows of df whose `column` value lies within 1.5 * IQR of the quartiles."""
+        Q1 = df[column].quantile(0.25)
+        Q3 = df[column].quantile(0.75)
+        IQR = Q3 - Q1
+        lower_bound = Q1 - 1.5 * IQR
+        upper_bound = Q3 + 1.5 * IQR
+        return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
+    def add_categorical_columns(self):
+        """Add the price_range and area_range bracket columns to self.df."""
+        # Rent brackets
+        self.df['price_range'] = pd.cut(
+            self.df['price'],
+            bins=[0, 20000, 25000, 30000, 35000, float('inf')],
+            labels=['<20K', '20-25K', '25-30K', '30-35K', '>35K']
+        )
+        # Area brackets
+        self.df['area_range'] = pd.cut(
+            self.df['area'],
+            bins=[0, 25, 30, 35, 40, float('inf')],
+            labels=['<25�W', '25-30�W', '30-35�W', '35-40�W', '>40�W']
+        )
+    def basic_statistics(self) -> Dict:
+        """Return count plus summary statistics for price, area and price per ping.
+        Returns:
+            Dict with total_properties, price_stats, area_stats and
+            price_per_ping_stats; values are rounded as shown below.
+        """
+        stats = {
+            'total_properties': len(self.df),
+            'price_stats': {
+                'mean': round(self.df['price'].mean(), 2),
+                'median': round(self.df['price'].median(), 2),
+                'std': round(self.df['price'].std(), 2),
+                'min': int(self.df['price'].min()),
+                'max': int(self.df['price'].max()),
+                'q25': round(self.df['price'].quantile(0.25), 2),
+                'q75': round(self.df['price'].quantile(0.75), 2)
+            },
+            'area_stats': {
+                'mean': round(self.df['area'].mean(), 2),
+                'median': round(self.df['area'].median(), 2),
+                'min': round(self.df['area'].min(), 1),
+                'max': round(self.df['area'].max(), 1)
+            },
+            'price_per_ping_stats': {
+                'mean': round(self.df['price_per_ping'].mean(), 2),
+                'median': round(self.df['price_per_ping'].median(), 2),
+                'min': round(self.df['price_per_ping'].min(), 2),
+                'max': round(self.df['price_per_ping'].max(), 2)
+            }
+        }
+        return stats
+    def price_distribution_analysis(self) -> Dict:
+        """Return listing counts per price_range bracket, in bracket order."""
+        distribution = self.df['price_range'].value_counts().sort_index()
+        return distribution.to_dict()
+    def area_distribution_analysis(self) -> Dict:
+        """Return listing counts per area_range bracket, in bracket order."""
+        distribution = self.df['area_range'].value_counts().sort_index()
+        return distribution.to_dict()
+    def keywords_analysis(self) -> Dict:
+        """Count how many descriptions mention each housing keyword.
+        Returns:
+            Dict of the ten most frequent keywords mapped to their hit counts,
+            most frequent first.
+        """
+        # Housing-related keywords to look for
+        keywords = [
+            '�񱶹B', '�񨮯�', '�q��', '���x', '������', '�޲z�O',
+            '�ĥ�', '�q��', '�w�R', '�K�Q', '�ͬ�����', '�ǰ�',
+            '���s', '���C', '�a��', '�a�q', '�N��', '�~���',
+            '���N�]', '�R�e', '��G', '��l�W', '���s', '�����P'
+        ]
+        keyword_counts = {keyword: 0 for keyword in keywords}
+        descriptions = self.df['raw_info'].dropna().tolist()
+        for desc in descriptions:
+            for keyword in keywords:
+                if keyword in str(desc):
+                    keyword_counts[keyword] += 1
+        # Sort by count and keep the top ten
+        sorted_keywords = dict(
+            sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10]
+        )
+        return sorted_keywords
+    @staticmethod
+    def _top_label(result) -> str:
+        """Return the best label from a pipeline result, or 'NEUTRAL' when there is none."""
+        if not result:
+            return 'NEUTRAL'
+        first = result[0]
+        # With return_all_scores=True each input maps to a LIST of
+        # {label, score} dicts, so pick the highest-scoring entry.
+        if isinstance(first, list):
+            if not first:
+                return 'NEUTRAL'
+            return max(first, key=lambda item: item['score'])['label']
+        return first['label']
+    def huggingface_analysis(self) -> Dict:
+        """Label up to ten descriptions with the sentiment pipeline.
+        Returns:
+            Dict with sentiment_distribution, dataset_size and sample_analysis,
+            or an empty dict when the model is disabled, there is no text, or
+            the analysis fails.
+        """
+        if not self.use_hf_models or self.sentiment_analyzer is None:
+            return {}
+        try:
+            # Take text, price and area from the SAME rows so they stay aligned
+            # even when some listings have no description. Capped at ten rows.
+            sample = self.df.dropna(subset=['raw_info']).head(10)
+            descriptions = sample['raw_info'].tolist()
+            if not descriptions:
+                return {}
+            # Sentiment per description
+            sentiments = []
+            for desc in descriptions:
+                try:
+                    result = self.sentiment_analyzer(desc[:100])  # truncate long text
+                    sentiments.append(self._top_label(result))
+                except Exception:
+                    # One bad description must not abort the whole batch
+                    sentiments.append('NEUTRAL')
+            # Tally the labels
+            sentiment_counts = {}
+            for sentiment in sentiments:
+                sentiment_counts[sentiment] = sentiment_counts.get(sentiment, 0) + 1
+            # Wrap the sample in a datasets.Dataset
+            hf_dataset = Dataset.from_dict({
+                'text': descriptions,
+                'price': sample['price'].tolist(),
+                'area': sample['area'].tolist(),
+                'sentiment': sentiments
+            })
+            return {
+                'sentiment_distribution': sentiment_counts,
+                'dataset_size': len(hf_dataset),
+                'sample_analysis': True
+            }
+        except Exception as e:
+            print(f"Hugging Face analysis error: {e}")
+            return {}
+    def correlation_analysis(self) -> Dict:
+        """Return pairwise correlations between price, area and price per ping.
+        Returns:
+            Dict keyed '<col1>_vs_<col2>' with values rounded to three places;
+            empty when fewer than two usable columns exist.
+        """
+        numeric_columns = ['price', 'area', 'price_per_ping']
+        available_columns = [
+            col for col in numeric_columns
+            if col in self.df.columns and not self.df[col].isna().all()
+        ]
+        if len(available_columns) < 2:
+            return {}
+        correlation_matrix = self.df[available_columns].corr()
+        correlations = {}
+        for i, col1 in enumerate(available_columns):
+            for j, col2 in enumerate(available_columns):
+                if i < j:  # each unordered pair once
+                    correlations[f"{col1}_vs_{col2}"] = round(
+                        correlation_matrix.loc[col1, col2], 3
+                    )
+        return correlations
+    def generate_insights(self) -> List[str]:
+        """Turn the stored analysis results into a list of insight sentences."""
+        insights = []
+        # Insights from the basic statistics
+        if 'basic_stats' in self.analysis_results:
+            stats = self.analysis_results['basic_stats']
+            insights.append(f"�@��� {stats['total_properties']} ���ŦX���󪺯��Ϊ���")
+            insights.append(f"���������� {stats['price_stats']['mean']:,} ��")
+            insights.append(f"��������Ƭ� {stats['price_stats']['median']:,} ��")
+            if stats['price_stats']['mean'] > stats['price_stats']['median']:
+                insights.append("���������V�k���סA�s�b����������԰�������")
+        # Insights from the price distribution
+        if 'price_distribution' in self.analysis_results:
+            dist = self.analysis_results['price_distribution']
+            # Skip the share when the total is missing or zero (nothing to divide by)
+            total = self.analysis_results.get('basic_stats', {}).get('total_properties', 0)
+            if dist and total:
+                most_common_range = max(dist, key=dist.get)
+                count = dist[most_common_range]
+                percentage = (count / total) * 100
+                insights.append(f"�̱`���������϶��O {most_common_range}�A�� {percentage:.1f}%")
+        # Insights from the Hugging Face step
+        if 'hf_analysis' in self.analysis_results and self.analysis_results['hf_analysis']:
+            hf_results = self.analysis_results['hf_analysis']
+            if 'sentiment_distribution' in hf_results:
+                insights.append("�w�ϥ�Hugging Face�ҫ��i�污�P���R")
+        return insights
+    def run_analysis(self) -> Dict:
+        """Run the full pipeline and return the accumulated results dict."""
+        # Clean first; every later step reads self.df
+        self.clean_data()
+        # Basic statistics
+        self.analysis_results['basic_stats'] = self.basic_statistics()
+        # Distributions
+        self.analysis_results['price_distribution'] = self.price_distribution_analysis()
+        self.analysis_results['area_distribution'] = self.area_distribution_analysis()
+        # Keywords
+        self.analysis_results['keywords_analysis'] = self.keywords_analysis()
+        # Correlations
+        self.analysis_results['correlation'] = self.correlation_analysis()
+        # Hugging Face step (only when a model is enabled)
+        if self.use_hf_models:
+            self.analysis_results['hf_analysis'] = self.huggingface_analysis()
+        # Insights depend on the entries stored above
+        self.analysis_results['insights'] = self.generate_insights()
+        return self.analysis_results

requirements.txt CHANGED Viewed

@@ -1,14 +1,13 @@
-# �� Copilot �ͦ�
-requests>=2.31.0
-beautifulsoup4>=4.12.0
 pandas>=2.0.0
 numpy>=1.24.0
 matplotlib>=3.7.0
 seaborn>=0.12.0
 transformers>=4.30.0
 datasets>=2.14.0
-plotly>=5.15.0
-jupyter>=1.0.0
-lxml>=4.9.0
-selenium>=4.10.0
-webdriver-manager>=3.8.0

+# �� Copilot �ͦ� - Hugging Face Spaces �ۮe����
+streamlit>=1.28.0
+gradio>=3.50.0
 pandas>=2.0.0
 numpy>=1.24.0
 matplotlib>=3.7.0
 seaborn>=0.12.0
+plotly>=5.15.0
+requests>=2.31.0
+beautifulsoup4>=4.12.0
 transformers>=4.30.0
 datasets>=2.14.0
+scikit-learn>=1.3.0