Spaces:
Paused
Paused
Upload 7 files
Browse files- 591_rental_analysis.ipynb +10 -0
- analyzer.py +373 -0
- main.py +179 -0
- requirements.txt +14 -0
- scraper.py +253 -0
- utils.py +150 -0
- visualizer.py +402 -0
591_rental_analysis.ipynb
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [],
|
| 3 |
+
"metadata": {
|
| 4 |
+
"language_info": {
|
| 5 |
+
"name": "python"
|
| 6 |
+
}
|
| 7 |
+
},
|
| 8 |
+
"nbformat": 4,
|
| 9 |
+
"nbformat_minor": 5
|
| 10 |
+
}
|
analyzer.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# �� Copilot �ͦ�
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from typing import Dict, List, Tuple
|
| 5 |
+
import json
|
| 6 |
+
from transformers import pipeline, AutoTokenizer, AutoModel
|
| 7 |
+
from datasets import Dataset
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
class RentalDataAnalyzer:
|
| 11 |
+
"""���θ�Ƥ��R��"""
|
| 12 |
+
|
| 13 |
+
def __init__(self, data_path: str = None):
|
| 14 |
+
"""
|
| 15 |
+
��l�Ƥ��R��
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
data_path: ����ɮ��|
|
| 19 |
+
"""
|
| 20 |
+
self.data_path = data_path
|
| 21 |
+
self.df = None
|
| 22 |
+
self.analysis_results = {}
|
| 23 |
+
|
| 24 |
+
# ��l��Hugging Face�ҫ��Ω��r���R
|
| 25 |
+
self.sentiment_analyzer = None
|
| 26 |
+
self.text_classifier = None
|
| 27 |
+
|
| 28 |
+
def load_data(self, data_path: str = None) -> pd.DataFrame:
    """Load rental data from a JSON or CSV file into ``self.df``.

    Args:
        data_path: Path to the data file; when given, it overrides the
            path supplied at construction time.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails.
    """
    if data_path:
        self.data_path = data_path

    try:
        if self.data_path.endswith('.json'):
            with open(self.data_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.df = pd.DataFrame(data)
        elif self.data_path.endswith('.csv'):
            # utf-8-sig tolerates a BOM written by Excel/Windows tools.
            self.df = pd.read_csv(self.data_path, encoding='utf-8-sig')
        else:
            raise ValueError("Unsupported file format")

        print(f"Successfully loaded {len(self.df)} records")
        return self.df

    except Exception as e:
        # Broad catch is deliberate: callers treat None as "load failed".
        print(f"Error while loading data: {e}")
        return None
|
| 49 |
+
|
| 50 |
+
def clean_data(self) -> pd.DataFrame:
    """Clean the loaded rental data in place.

    Drops duplicate listings, coerces price/area to numeric, derives a
    price-per-ping column, and strips price outliers via the IQR rule.

    Returns:
        The cleaned DataFrame, or ``None`` when no data has been loaded.
    """
    if self.df is None:
        print("No data loaded yet")
        return None

    print("Starting data cleaning...")

    # Remove duplicate listings (same title/address/price combination).
    original_count = len(self.df)
    self.df = self.df.drop_duplicates(subset=['title', 'address', 'price'])
    print(f"Removed {original_count - len(self.df)} duplicate records")

    # Coerce price to numeric and drop non-positive prices.
    self.df['price'] = pd.to_numeric(self.df['price'], errors='coerce')
    self.df = self.df[self.df['price'] > 0]

    # Coerce area (ping) to numeric.
    self.df['area'] = pd.to_numeric(self.df['area'], errors='coerce')

    # Derive price per ping; NaN when the area is missing or non-positive.
    self.df['price_per_ping'] = self.df.apply(
        lambda row: row['price'] / row['area'] if row['area'] > 0 else np.nan,
        axis=1
    )

    # Strip price outliers using the IQR rule.
    self.df = self.remove_outliers(self.df, 'price')

    print(f"{len(self.df)} valid records remain after cleaning")
    return self.df
|
| 81 |
+
|
| 82 |
+
def remove_outliers(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return *df* with rows outside the 1.5*IQR fences on *column* removed.

    Rows whose *column* value is NaN are also dropped, because they fail
    both fence comparisons.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Report how many rows fall strictly outside the fences.
    outliers_count = len(df[(df[column] < lower_bound) | (df[column] > upper_bound)])
    print(f"Removed {outliers_count} outliers from {column}")

    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
|
| 95 |
+
|
| 96 |
+
def basic_statistics(self) -> Dict:
    """Compute summary statistics for price, area and price-per-ping.

    Returns an empty dict when no (cleaned) data is available; the
    result is also cached under ``analysis_results['basic_stats']``.
    """
    if self.df is None or len(self.df) == 0:
        return {}

    price = self.df['price']
    area = self.df['area']
    per_ping = self.df['price_per_ping']

    price_stats = {
        'mean': round(price.mean(), 2),
        'median': round(price.median(), 2),
        'std': round(price.std(), 2),
        'min': price.min(),
        'max': price.max(),
        'q25': round(price.quantile(0.25), 2),
        'q75': round(price.quantile(0.75), 2),
    }

    # Area/per-ping blocks degrade to {} when the column is entirely NaN.
    area_stats = {}
    if not area.isna().all():
        area_stats = {
            'mean': round(area.mean(), 2),
            'median': round(area.median(), 2),
            'min': area.min(),
            'max': area.max(),
        }

    per_ping_stats = {}
    if not per_ping.isna().all():
        per_ping_stats = {
            'mean': round(per_ping.mean(), 2),
            'median': round(per_ping.median(), 2),
            'min': round(per_ping.min(), 2),
            'max': round(per_ping.max(), 2),
        }

    stats = {
        'total_properties': len(self.df),
        'price_stats': price_stats,
        'area_stats': area_stats,
        'price_per_ping_stats': per_ping_stats,
    }

    self.analysis_results['basic_stats'] = stats
    return stats
|
| 128 |
+
|
| 129 |
+
def price_distribution_analysis(self) -> Dict:
    """Bucket listings into rent ranges and report counts and percentages."""
    if self.df is None or len(self.df) == 0:
        return {}

    # Half-open monthly-rent buckets (right=False -> [lo, hi)).
    bins = [0, 15000, 20000, 25000, 30000, 40000, float('inf')]
    labels = ['<15K', '15-20K', '20-25K', '25-30K', '30-40K', '>40K']
    self.df['price_range'] = pd.cut(self.df['price'], bins=bins, labels=labels, right=False)

    counts = self.df['price_range'].value_counts().sort_index()
    shares = (counts / len(self.df) * 100).round(2)

    result = {
        'ranges': counts.index.tolist(),
        'counts': counts.values.tolist(),
        'percentages': shares.tolist(),
    }

    self.analysis_results['price_distribution'] = result
    return result
|
| 150 |
+
|
| 151 |
+
def area_analysis(self) -> Dict:
|
| 152 |
+
"""�W�Ƥ��R"""
|
| 153 |
+
if self.df is None or len(self.df) == 0 or self.df['area'].isna().all():
|
| 154 |
+
return {}
|
| 155 |
+
|
| 156 |
+
# �w�q�W�ư϶�
|
| 157 |
+
area_bins = [0, 20, 30, 40, 50, float('inf')]
|
| 158 |
+
area_labels = ['<20�W', '20-30�W', '30-40�W', '40-50�W', '>50�W']
|
| 159 |
+
|
| 160 |
+
self.df['area_range'] = pd.cut(self.df['area'], bins=area_bins, labels=area_labels, right=False)
|
| 161 |
+
|
| 162 |
+
area_distribution = self.df['area_range'].value_counts().sort_index()
|
| 163 |
+
|
| 164 |
+
area_dict = {
|
| 165 |
+
'ranges': area_distribution.index.tolist(),
|
| 166 |
+
'counts': area_distribution.values.tolist(),
|
| 167 |
+
'percentages': (area_distribution / len(self.df) * 100).round(2).tolist()
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
self.analysis_results['area_analysis'] = area_dict
|
| 171 |
+
return area_dict
|
| 172 |
+
|
| 173 |
+
def setup_huggingface_models(self):
|
| 174 |
+
"""�]�mHugging Face�ҫ�"""
|
| 175 |
+
try:
|
| 176 |
+
print("���JHugging Face�ҫ�...")
|
| 177 |
+
|
| 178 |
+
# ���J���屡�P���R�ҫ�
|
| 179 |
+
self.sentiment_analyzer = pipeline(
|
| 180 |
+
"sentiment-analysis",
|
| 181 |
+
model="ckiplab/bert-base-chinese-ws",
|
| 182 |
+
return_all_scores=True
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
print("Hugging Face�ҫ����J����")
|
| 186 |
+
except Exception as e:
|
| 187 |
+
print(f"���JHugging Face�ҫ��ɵo�Ϳ��~: {e}")
|
| 188 |
+
|
| 189 |
+
def analyze_descriptions(self) -> Dict:
    """Run keyword-frequency analysis over the raw listing descriptions.

    Returns {} when there is no data or no ``raw_info`` column; otherwise
    caches and returns the keyword frequencies and description count.
    """
    if self.df is None or 'raw_info' not in self.df.columns:
        return {}

    descriptions = self.df['raw_info'].dropna().tolist()
    if not descriptions:
        return {}

    result = {
        'keywords_frequency': self.analyze_keywords(descriptions),
        'total_descriptions': len(descriptions),
    }

    self.analysis_results['description_analysis'] = result
    return result
|
| 209 |
+
|
| 210 |
+
def analyze_keywords(self, descriptions: List[str]) -> Dict:
    """Count how many descriptions mention each rental-related keyword.

    Returns the ten most frequent keywords, sorted by count descending.

    NOTE(review): the keyword literals below are mojibake (mis-decoded
    Chinese terms) inherited from the source file; they are kept
    byte-for-byte to preserve runtime behavior — TODO restore the
    original Big5/UTF-8 text.
    """
    keywords = [
        '�B', '��', '�q��', '���x', '������', '�z�O',
        '�ĥ�', '�q��', '�w�R', '�K�Q', '�ͬ�����', '�ǰ�',
        '���s', '���C', '�a��', '�a�q', '�N��', '�~���'
    ]

    hits = dict.fromkeys(keywords, 0)
    for text in descriptions:
        for kw in keywords:
            if kw in text:
                hits[kw] += 1

    # Keep only the ten most frequent keywords, highest count first.
    ranked = sorted(hits.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:10])
|
| 230 |
+
|
| 231 |
+
def correlation_analysis(self) -> Dict:
    """Compute pairwise Pearson correlations between the numeric columns.

    Only columns that exist and contain at least one non-NaN value are
    considered; at least two such columns are required.
    """
    if self.df is None or len(self.df) == 0:
        return {}

    candidates = ['price', 'area', 'price_per_ping']
    usable = [c for c in candidates
              if c in self.df.columns and not self.df[c].isna().all()]

    if len(usable) < 2:
        return {}

    matrix = self.df[usable].corr()

    # Upper triangle only, so each pair is reported once.
    pairs = {}
    for i, first in enumerate(usable):
        for second in usable[i + 1:]:
            pairs[f"{first}_vs_{second}"] = round(matrix.loc[first, second], 3)

    self.analysis_results['correlation'] = pairs
    return pairs
|
| 252 |
+
|
| 253 |
+
def generate_insights(self) -> List[str]:
|
| 254 |
+
"""�ͦ����R�}��"""
|
| 255 |
+
insights = []
|
| 256 |
+
|
| 257 |
+
if 'basic_stats' in self.analysis_results:
|
| 258 |
+
stats = self.analysis_results['basic_stats']
|
| 259 |
+
insights.append(f"�@��� {stats['total_properties']} ���ŦX�����Ϊ���")
|
| 260 |
+
insights.append(f"���������� {stats['price_stats']['mean']:,} ��")
|
| 261 |
+
insights.append(f"��������Ƭ� {stats['price_stats']['median']:,} ��")
|
| 262 |
+
|
| 263 |
+
if stats['price_stats']['mean'] > stats['price_stats']['median']:
|
| 264 |
+
insights.append("���������V�k���סA�s�b�����������������")
|
| 265 |
+
|
| 266 |
+
if 'price_distribution' in self.analysis_results:
|
| 267 |
+
dist = self.analysis_results['price_distribution']
|
| 268 |
+
max_range_idx = dist['percentages'].index(max(dist['percentages']))
|
| 269 |
+
most_common_range = dist['ranges'][max_range_idx]
|
| 270 |
+
percentage = dist['percentages'][max_range_idx]
|
| 271 |
+
insights.append(f"�̱`���������϶��O {most_common_range}�A�� {percentage}%")
|
| 272 |
+
|
| 273 |
+
if 'area_analysis' in self.analysis_results:
|
| 274 |
+
area = self.analysis_results['area_analysis']
|
| 275 |
+
if area:
|
| 276 |
+
max_area_idx = area['percentages'].index(max(area['percentages']))
|
| 277 |
+
most_common_area = area['ranges'][max_area_idx]
|
| 278 |
+
insights.append(f"�̱`�����W�ư϶��O {most_common_area}")
|
| 279 |
+
|
| 280 |
+
return insights
|
| 281 |
+
|
| 282 |
+
def run_full_analysis(self) -> Dict:
|
| 283 |
+
"""���槹����R"""
|
| 284 |
+
print("�}�l���槹����R...")
|
| 285 |
+
|
| 286 |
+
# �έp
|
| 287 |
+
basic_stats = self.basic_statistics()
|
| 288 |
+
print("? �έp���R����")
|
| 289 |
+
|
| 290 |
+
# �����������R
|
| 291 |
+
price_dist = self.price_distribution_analysis()
|
| 292 |
+
print("? �����������R����")
|
| 293 |
+
|
| 294 |
+
# �W�Ƥ��R
|
| 295 |
+
area_analysis = self.area_analysis()
|
| 296 |
+
print("? �W�Ƥ��R����")
|
| 297 |
+
|
| 298 |
+
# �y�z��r���R
|
| 299 |
+
desc_analysis = self.analyze_descriptions()
|
| 300 |
+
print("? �y�z��r���R����")
|
| 301 |
+
|
| 302 |
+
# �����ʤ��R
|
| 303 |
+
correlation = self.correlation_analysis()
|
| 304 |
+
print("? �����ʤ��R����")
|
| 305 |
+
|
| 306 |
+
# �ͦ��}��
|
| 307 |
+
insights = self.generate_insights()
|
| 308 |
+
print("? �}��ͦ�����")
|
| 309 |
+
|
| 310 |
+
self.analysis_results['insights'] = insights
|
| 311 |
+
|
| 312 |
+
return self.analysis_results
|
| 313 |
+
|
| 314 |
+
def save_analysis_results(self, filename: str = "analysis_results.json"):
    """Write ``self.analysis_results`` as UTF-8 JSON under ``output/``.

    Bug fix: the original ignored *filename* and wrote to a hard-coded
    literal path; the parameter is now honored.

    Args:
        filename: File name (not a path) created inside ``output/``.
    """
    try:
        path = f"output/{filename}"
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.analysis_results, f, ensure_ascii=False, indent=2)
        print(f"Analysis results saved to {path}")
    except Exception as e:
        # Best-effort save: report and continue, matching the original.
        print(f"Error while saving analysis results: {e}")
|
| 322 |
+
|
| 323 |
+
def print_summary(self):
|
| 324 |
+
"""�L�X���R�K�n"""
|
| 325 |
+
if not self.analysis_results:
|
| 326 |
+
print("�S�����R���G�i���")
|
| 327 |
+
return
|
| 328 |
+
|
| 329 |
+
print("\n" + "="*50)
|
| 330 |
+
print("���������s�ϯ��Υ������R���i")
|
| 331 |
+
print("="*50)
|
| 332 |
+
|
| 333 |
+
if 'insights' in self.analysis_results:
|
| 334 |
+
print("\n? ���n�}��:")
|
| 335 |
+
for i, insight in enumerate(self.analysis_results['insights'], 1):
|
| 336 |
+
print(f"{i}. {insight}")
|
| 337 |
+
|
| 338 |
+
if 'basic_stats' in self.analysis_results:
|
| 339 |
+
stats = self.analysis_results['basic_stats']
|
| 340 |
+
print(f"\n? �����έp:")
|
| 341 |
+
print(f" ��������: {stats['price_stats']['mean']:,} ��")
|
| 342 |
+
print(f" �����: {stats['price_stats']['median']:,} ��")
|
| 343 |
+
print(f" �̧C����: {stats['price_stats']['min']:,} ��")
|
| 344 |
+
print(f" �̰�����: {stats['price_stats']['max']:,} ��")
|
| 345 |
+
print(f" �зǮt: {stats['price_stats']['std']:,} ��")
|
| 346 |
+
|
| 347 |
+
if 'price_distribution' in self.analysis_results:
|
| 348 |
+
print(f"\n? ��������:")
|
| 349 |
+
dist = self.analysis_results['price_distribution']
|
| 350 |
+
for range_name, count, percentage in zip(dist['ranges'], dist['counts'], dist['percentages']):
|
| 351 |
+
print(f" {range_name}: {count} �� ({percentage}%)")
|
| 352 |
+
|
| 353 |
+
print("\n" + "="*50)
|
| 354 |
+
|
| 355 |
+
if __name__ == "__main__":
|
| 356 |
+
# ���դ��R��
|
| 357 |
+
analyzer = RentalDataAnalyzer()
|
| 358 |
+
|
| 359 |
+
# ���J���
|
| 360 |
+
df = analyzer.load_data("output/rental_data.csv")
|
| 361 |
+
|
| 362 |
+
if df is not None:
|
| 363 |
+
# �M�~���
|
| 364 |
+
analyzer.clean_data()
|
| 365 |
+
|
| 366 |
+
# ���槹����R
|
| 367 |
+
results = analyzer.run_full_analysis()
|
| 368 |
+
|
| 369 |
+
# �x�s���G
|
| 370 |
+
analyzer.save_analysis_results()
|
| 371 |
+
|
| 372 |
+
# ��ܺK�n
|
| 373 |
+
analyzer.print_summary()
|
main.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# �� Copilot �ͦ�
|
| 2 |
+
"""
|
| 3 |
+
591���θ�Ƥ��R�� - �D�{��
|
| 4 |
+
���������s�ϯ��Υ������R�u��
|
| 5 |
+
|
| 6 |
+
���{����X�F�������ΡB��Ƥ��R�M��ı�ƥ\��A
|
| 7 |
+
�M���Ω���R591���κ������θ�ơC
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import sys
|
| 12 |
+
import argparse
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
|
| 15 |
+
# �[�J�۹���|
|
| 16 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 17 |
+
|
| 18 |
+
from scraper import Rent591Scraper
|
| 19 |
+
from analyzer import RentalDataAnalyzer
|
| 20 |
+
from visualizer import RentalDataVisualizer
|
| 21 |
+
from utils import log_message, create_output_directories, get_current_timestamp
|
| 22 |
+
|
| 23 |
+
class RentalAnalysisApp:
|
| 24 |
+
"""591���Τ��R���ε{���D���O"""
|
| 25 |
+
|
| 26 |
+
def __init__(self):
|
| 27 |
+
self.scraper = Rent591Scraper()
|
| 28 |
+
self.analyzer = RentalDataAnalyzer()
|
| 29 |
+
self.visualizer = RentalDataVisualizer()
|
| 30 |
+
self.timestamp = get_current_timestamp()
|
| 31 |
+
|
| 32 |
+
def run_full_pipeline(self, max_pages: int = 5, skip_scraping: bool = False):
|
| 33 |
+
"""���槹�㪺���R�y�{"""
|
| 34 |
+
print("? 591���θ�Ƥ��R���Ұ�")
|
| 35 |
+
print("=" * 50)
|
| 36 |
+
|
| 37 |
+
# �Ыؿ�X�ؿ�
|
| 38 |
+
create_output_directories()
|
| 39 |
+
|
| 40 |
+
# �B�J1: ��ƪ���
|
| 41 |
+
if not skip_scraping:
|
| 42 |
+
log_message("�}�l����591����...")
|
| 43 |
+
rental_data = self.scraper.scrape_rental_data(max_pages=max_pages)
|
| 44 |
+
|
| 45 |
+
if not rental_data:
|
| 46 |
+
log_message("������������ơA�{���פ�", "ERROR")
|
| 47 |
+
return False
|
| 48 |
+
|
| 49 |
+
log_message(f"���\���� {len(rental_data)} �����")
|
| 50 |
+
|
| 51 |
+
# �x�s��l���
|
| 52 |
+
self.scraper.save_data(rental_data, f"raw_data_{self.timestamp}.json")
|
| 53 |
+
|
| 54 |
+
# �ഫ��CSV
|
| 55 |
+
df = self.scraper.to_dataframe(rental_data)
|
| 56 |
+
csv_filename = f"output/rental_data_{self.timestamp}.csv"
|
| 57 |
+
df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
|
| 58 |
+
log_message(f"��Ƥw�x�s��CSV: {csv_filename}")
|
| 59 |
+
|
| 60 |
+
# �ϥγ̷s������ɮ�
|
| 61 |
+
data_file = csv_filename
|
| 62 |
+
else:
|
| 63 |
+
# �M��̷s������ɮ�
|
| 64 |
+
data_files = [f for f in os.listdir("output") if f.startswith("rental_data") and f.endswith(".csv")]
|
| 65 |
+
if not data_files:
|
| 66 |
+
log_message("�䤣��{������ɮסA�Х����檦��", "ERROR")
|
| 67 |
+
return False
|
| 68 |
+
data_file = f"output/{sorted(data_files)[-1]}"
|
| 69 |
+
log_message(f"�ϥβ{������ɮ�: {data_file}")
|
| 70 |
+
|
| 71 |
+
# �B�J2: ��Ƥ��R
|
| 72 |
+
log_message("�}�l��Ƥ��R...")
|
| 73 |
+
|
| 74 |
+
# ���J���
|
| 75 |
+
self.analyzer.load_data(data_file)
|
| 76 |
+
|
| 77 |
+
# �M�~���
|
| 78 |
+
cleaned_df = self.analyzer.clean_data()
|
| 79 |
+
if cleaned_df is None or len(cleaned_df) == 0:
|
| 80 |
+
log_message("��ƲM�~��S�����ĸ��", "ERROR")
|
| 81 |
+
return False
|
| 82 |
+
|
| 83 |
+
# ���槹����R
|
| 84 |
+
analysis_results = self.analyzer.run_full_analysis()
|
| 85 |
+
|
| 86 |
+
# �x�s���R���G
|
| 87 |
+
results_filename = f"analysis_results_{self.timestamp}.json"
|
| 88 |
+
self.analyzer.save_analysis_results(results_filename)
|
| 89 |
+
|
| 90 |
+
# ��ܤ��R�K�n
|
| 91 |
+
self.analyzer.print_summary()
|
| 92 |
+
|
| 93 |
+
# �B�J3: ��Ƶ�ı��
|
| 94 |
+
log_message("�}�l�ͦ���ı�ƹϪ�...")
|
| 95 |
+
|
| 96 |
+
# �]�m��ı�ƾ�
|
| 97 |
+
self.visualizer.df = cleaned_df
|
| 98 |
+
self.visualizer.analysis_results = analysis_results
|
| 99 |
+
|
| 100 |
+
# �ͦ��Ҧ��Ϫ�
|
| 101 |
+
self.visualizer.generate_all_visualizations()
|
| 102 |
+
|
| 103 |
+
# �ЫغK�n���i
|
| 104 |
+
summary_filename = f"output/summary_report_{self.timestamp}.png"
|
| 105 |
+
self.visualizer.create_summary_report(summary_filename)
|
| 106 |
+
|
| 107 |
+
log_message("���R�����I", "SUCCESS")
|
| 108 |
+
self.print_completion_summary()
|
| 109 |
+
|
| 110 |
+
return True
|
| 111 |
+
|
| 112 |
+
def print_completion_summary(self):
|
| 113 |
+
"""�L�X�����K�n"""
|
| 114 |
+
print("\n" + "? ���R�����I" + "?")
|
| 115 |
+
print("=" * 50)
|
| 116 |
+
print("? ��X�ɮ�:")
|
| 117 |
+
print(f" �u�w�w ��l���: output/raw_data_{self.timestamp}.json")
|
| 118 |
+
print(f" �u�w�w �M�~���: output/rental_data_{self.timestamp}.csv")
|
| 119 |
+
print(f" �u�w�w ���R���G: output/analysis_results_{self.timestamp}.json")
|
| 120 |
+
print(f" �u�w�w �K�n���i: output/summary_report_{self.timestamp}.png")
|
| 121 |
+
print(" �u�w�w �Ϫ��ɮ�:")
|
| 122 |
+
print(" �x �u�w�w output/price_distribution.png")
|
| 123 |
+
print(" �x �u�w�w output/price_ranges.png")
|
| 124 |
+
print(" �x �u�w�w output/area_analysis.png")
|
| 125 |
+
print(" �x �u�w�w output/price_per_ping.png")
|
| 126 |
+
print(" �x �|�w�w output/keywords_analysis.png")
|
| 127 |
+
print(" �|�w�w ���ʦ������O: output/dashboard.html")
|
| 128 |
+
print("\n? ����: ���} dashboard.html �i�d�ݤ��ʦ����R���G")
|
| 129 |
+
print("=" * 50)
|
| 130 |
+
|
| 131 |
+
def main():
|
| 132 |
+
"""�D���"""
|
| 133 |
+
parser = argparse.ArgumentParser(description='591���θ�Ƥ��R��')
|
| 134 |
+
parser.add_argument('--max-pages', type=int, default=5,
|
| 135 |
+
help='�̤j�������� (�w�]: 5)')
|
| 136 |
+
parser.add_argument('--skip-scraping', action='store_true',
|
| 137 |
+
help='���L���ΡA�ϥβ{����ƶi����R')
|
| 138 |
+
parser.add_argument('--analysis-only', action='store_true',
|
| 139 |
+
help='�Ȱ�����R�A�����s�������')
|
| 140 |
+
|
| 141 |
+
args = parser.parse_args()
|
| 142 |
+
|
| 143 |
+
try:
|
| 144 |
+
app = RentalAnalysisApp()
|
| 145 |
+
|
| 146 |
+
if args.analysis_only:
|
| 147 |
+
# �Ȥ��R�Ҧ�
|
| 148 |
+
log_message("����Ȥ��R�Ҧ�...")
|
| 149 |
+
success = app.run_full_pipeline(max_pages=0, skip_scraping=True)
|
| 150 |
+
else:
|
| 151 |
+
# ����y�{
|
| 152 |
+
success = app.run_full_pipeline(
|
| 153 |
+
max_pages=args.max_pages,
|
| 154 |
+
skip_scraping=args.skip_scraping
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
if success:
|
| 158 |
+
log_message("�{�����榨�\�����I", "SUCCESS")
|
| 159 |
+
return 0
|
| 160 |
+
else:
|
| 161 |
+
log_message("�{�����楢��", "ERROR")
|
| 162 |
+
return 1
|
| 163 |
+
|
| 164 |
+
except KeyboardInterrupt:
|
| 165 |
+
log_message("�ϥΪ̤��_�{������", "WARNING")
|
| 166 |
+
return 1
|
| 167 |
+
except Exception as e:
|
| 168 |
+
log_message(f"�{������ɵo�ͥ��w�����~: {e}", "ERROR")
|
| 169 |
+
return 1
|
| 170 |
+
|
| 171 |
+
if __name__ == "__main__":
|
| 172 |
+
# �]�m�{����T
|
| 173 |
+
print("? 591���θ�Ƥ��R��")
|
| 174 |
+
print("? �ؼаϰ�: ���������s��")
|
| 175 |
+
print("? ��������: 2�СB��h�B�q��j��")
|
| 176 |
+
print("? ��X Hugging Face �ͺA�t��")
|
| 177 |
+
print("-" * 50)
|
| 178 |
+
|
| 179 |
+
exit_code = main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generated by Copilot (original comment was mojibake-corrupted Chinese)
|
| 2 |
+
requests>=2.31.0
|
| 3 |
+
beautifulsoup4>=4.12.0
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
matplotlib>=3.7.0
|
| 7 |
+
seaborn>=0.12.0
|
| 8 |
+
transformers>=4.30.0
|
| 9 |
+
datasets>=2.14.0
|
| 10 |
+
plotly>=5.15.0
|
| 11 |
+
jupyter>=1.0.0
|
| 12 |
+
lxml>=4.9.0
|
| 13 |
+
selenium>=4.10.0
|
| 14 |
+
webdriver-manager>=3.8.0
|
scraper.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# �� Copilot �ͦ�
|
| 2 |
+
import requests
|
| 3 |
+
import time
|
| 4 |
+
import json
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
from selenium import webdriver
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.chrome.service import Service
|
| 10 |
+
from selenium.webdriver.chrome.options import Options
|
| 11 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 12 |
+
import re
|
| 13 |
+
from typing import List, Dict, Optional
|
| 14 |
+
|
| 15 |
+
class Rent591Scraper:
|
| 16 |
+
"""591����������O"""
|
| 17 |
+
|
| 18 |
+
def __init__(self):
|
| 19 |
+
self.base_url = "https://rent.591.com.tw"
|
| 20 |
+
self.headers = {
|
| 21 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 22 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 23 |
+
'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
|
| 24 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 25 |
+
'Connection': 'keep-alive',
|
| 26 |
+
'Upgrade-Insecure-Requests': '1',
|
| 27 |
+
}
|
| 28 |
+
self.session = requests.Session()
|
| 29 |
+
self.session.headers.update(self.headers)
|
| 30 |
+
|
| 31 |
+
def setup_driver(self):
|
| 32 |
+
"""�]�mChrome WebDriver"""
|
| 33 |
+
chrome_options = Options()
|
| 34 |
+
chrome_options.add_argument('--headless') # �L�Y�Ҧ�
|
| 35 |
+
chrome_options.add_argument('--no-sandbox')
|
| 36 |
+
chrome_options.add_argument('--disable-dev-shm-usage')
|
| 37 |
+
chrome_options.add_argument('--disable-gpu')
|
| 38 |
+
chrome_options.add_argument('--window-size=1920,1080')
|
| 39 |
+
chrome_options.add_argument(f'--user-agent={self.headers["User-Agent"]}')
|
| 40 |
+
|
| 41 |
+
service = Service(ChromeDriverManager().install())
|
| 42 |
+
driver = webdriver.Chrome(service=service, options=chrome_options)
|
| 43 |
+
return driver
|
| 44 |
+
|
| 45 |
+
def get_csrf_token(self, driver):
|
| 46 |
+
"""���CSRF Token"""
|
| 47 |
+
try:
|
| 48 |
+
# �X�ݭ������token
|
| 49 |
+
driver.get("https://rent.591.com.tw/")
|
| 50 |
+
time.sleep(2)
|
| 51 |
+
|
| 52 |
+
# ���ձq����������token
|
| 53 |
+
token_element = driver.find_element(By.NAME, "csrf-token")
|
| 54 |
+
if token_element:
|
| 55 |
+
return token_element.get_attribute("content")
|
| 56 |
+
|
| 57 |
+
# �p�G�S���A���ձqcookies�����
|
| 58 |
+
cookies = driver.get_cookies()
|
| 59 |
+
for cookie in cookies:
|
| 60 |
+
if 'token' in cookie['name'].lower():
|
| 61 |
+
return cookie['value']
|
| 62 |
+
|
| 63 |
+
except Exception as e:
|
| 64 |
+
print(f"���token����: {e}")
|
| 65 |
+
|
| 66 |
+
return None
|
| 67 |
+
|
| 68 |
+
def scrape_rental_data(self, max_pages: int = 10) -> List[Dict]:
|
| 69 |
+
"""
|
| 70 |
+
��������
|
| 71 |
+
|
| 72 |
+
Args:
|
| 73 |
+
max_pages: �̤j��������
|
| 74 |
+
|
| 75 |
+
Returns:
|
| 76 |
+
���θ�ƦC��
|
| 77 |
+
"""
|
| 78 |
+
driver = self.setup_driver()
|
| 79 |
+
all_data = []
|
| 80 |
+
|
| 81 |
+
try:
|
| 82 |
+
# �ؼ�URL�Ѽ�
|
| 83 |
+
params = {
|
| 84 |
+
'region': '17', # ������
|
| 85 |
+
'section': '247', # ���s��
|
| 86 |
+
'kind': '1', # ��h���a
|
| 87 |
+
'layout': '2', # 2��
|
| 88 |
+
'shape': '2' # �q��j��
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
for page in range(1, max_pages + 1):
|
| 92 |
+
print(f"���b������ {page} ��...")
|
| 93 |
+
|
| 94 |
+
# �c��URL
|
| 95 |
+
params['page'] = page
|
| 96 |
+
url = f"{self.base_url}/list?" + "&".join([f"{k}={v}" for k, v in params.items()])
|
| 97 |
+
|
| 98 |
+
driver.get(url)
|
| 99 |
+
time.sleep(3) # ���ݭ������J
|
| 100 |
+
|
| 101 |
+
# �ˬd�O�_�����
|
| 102 |
+
rental_items = driver.find_elements(By.CSS_SELECTOR, '.rent-item')
|
| 103 |
+
if not rental_items:
|
| 104 |
+
print(f"�� {page} ���S������ơA�����")
|
| 105 |
+
break
|
| 106 |
+
|
| 107 |
+
page_data = self.parse_page_data(driver)
|
| 108 |
+
all_data.extend(page_data)
|
| 109 |
+
|
| 110 |
+
print(f"�� {page} ����� {len(page_data)} �����")
|
| 111 |
+
|
| 112 |
+
# �קK�Q��IP�A�[�J����
|
| 113 |
+
time.sleep(2)
|
| 114 |
+
|
| 115 |
+
except Exception as e:
|
| 116 |
+
print(f"������Ʈɵo�Ϳ��~: {e}")
|
| 117 |
+
finally:
|
| 118 |
+
driver.quit()
|
| 119 |
+
|
| 120 |
+
return all_data
|
| 121 |
+
|
| 122 |
+
def parse_page_data(self, driver) -> List[Dict]:
|
| 123 |
+
"""�ѪR�歶���"""
|
| 124 |
+
page_data = []
|
| 125 |
+
|
| 126 |
+
try:
|
| 127 |
+
# �������HTML
|
| 128 |
+
soup = BeautifulSoup(driver.page_source, 'html.parser')
|
| 129 |
+
|
| 130 |
+
# �d�䯲�ζ���
|
| 131 |
+
rental_items = soup.find_all('div', class_='rent-item')
|
| 132 |
+
|
| 133 |
+
for item in rental_items:
|
| 134 |
+
try:
|
| 135 |
+
rental_info = self.extract_rental_info(item)
|
| 136 |
+
if rental_info:
|
| 137 |
+
page_data.append(rental_info)
|
| 138 |
+
except Exception as e:
|
| 139 |
+
print(f"�ѪR�浧��Ʈɵo�Ϳ��~: {e}")
|
| 140 |
+
continue
|
| 141 |
+
|
| 142 |
+
except Exception as e:
|
| 143 |
+
print(f"�ѪR������Ʈɵo�Ϳ��~: {e}")
|
| 144 |
+
|
| 145 |
+
return page_data
|
| 146 |
+
|
| 147 |
+
def extract_rental_info(self, item) -> Optional[Dict]:
    """Extract one listing's fields from a single rent-item element.

    Returns a dict with title/price/address/area/floor/link/raw_info,
    or None when the element cannot be parsed.
    """
    try:
        def node_text(node, fallback):
            # Stripped text of a tag, or the fallback when the tag is absent.
            return node.get_text(strip=True) if node else fallback

        title = node_text(item.find('h3', class_='rent-item-title'), "N/A")
        price_raw = node_text(item.find('div', class_='rent-item-price'), "0")
        address = node_text(item.find('div', class_='rent-item-address'), "N/A")
        details = node_text(item.find('div', class_='rent-item-info'), "")

        # Build an absolute link only when the anchor has a non-empty href.
        anchor = item.find('a')
        href = anchor.get('href') if anchor else None
        link = self.base_url + href if href else ""

        return {
            'title': title,
            'price': self.extract_price(price_raw),
            'address': address,
            'area': self.extract_area(details),
            'floor': self.extract_floor(details),
            'link': link,
            'raw_info': details
        }

    except Exception as e:
        print(f"�������θ�T�ɵo�Ϳ��~: {e}")
        return None
|
| 188 |
+
|
| 189 |
+
def extract_price(self, price_text: str) -> int:
    """Extract the numeric rent value from a price string.

    Commas are treated as thousands separators and stripped before
    matching, so the original `[\\d,]+` pattern (searched after comma
    removal) is simplified to `\\d+` with identical results.

    Returns:
        The first run of digits as an int, or 0 when no digits exist
        or the input is not a string.
    """
    try:
        price_match = re.search(r'\d+', price_text.replace(',', ''))
        if price_match:
            return int(price_match.group())
    except (TypeError, AttributeError):
        # Narrowed from a bare except: only non-string inputs fall through.
        pass
    return 0
|
| 199 |
+
|
| 200 |
+
def extract_area(self, info_text: str) -> float:
    """Pull the ping figure out of the free-form info string; 0.0 if absent.

    NOTE(review): the unit character in the pattern is mojibake-damaged
    in the source; it is preserved verbatim pending re-encoding.
    """
    try:
        found = re.search(r'(\d+(?:\.\d+)?)\s*�W', info_text)
        return float(found.group(1)) if found else 0.0
    except:
        return 0.0
|
| 209 |
+
|
| 210 |
+
def extract_floor(self, info_text: str) -> str:
    """Return the floor portion of the info string (digits + unit), or "N/A".

    NOTE(review): the floor-unit character in the pattern is mojibake-damaged
    in the source; preserved verbatim pending re-encoding.
    """
    try:
        found = re.search(r'(\d+)��', info_text)
        if found:
            return f"{found.group(1)}��"
    except:
        pass
    return "N/A"
|
| 219 |
+
|
| 220 |
+
def save_data(self, data: List[Dict], filename: str = "rental_data.json"):
    """Persist scraped listings as UTF-8 JSON under the output/ directory.

    Args:
        data: Parsed listing dicts to serialize.
        filename: Target file name inside "output/".

    Any I/O failure is caught and reported rather than raised.
    """
    try:
        # BUG FIX: the path previously ignored the *filename* argument
        # (it wrote to a hard-coded literal); honor the caller's name.
        path = f"output/{filename}"
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"��Ƥw�x�s�� {path}")
    except Exception as e:
        print(f"�x�s��Ʈɵo�Ϳ��~: {e}")
|
| 228 |
+
|
| 229 |
+
def to_dataframe(self, data: List[Dict]) -> pd.DataFrame:
    """Wrap the scraped listing records in a pandas DataFrame."""
    frame = pd.DataFrame(data)
    return frame
|
| 232 |
+
|
| 233 |
+
if __name__ == "__main__":
    # Ad-hoc entry point: scrape a few pages and dump JSON + CSV snapshots.
    scraper = Rent591Scraper()
    print("�}�l����591����...")

    rental_data = scraper.scrape_rental_data(max_pages=5)

    if not rental_data:
        print("�S�������������")
    else:
        print(f"�`�@������ {len(rental_data)} �����")

        # Raw JSON dump first, then a CSV export via DataFrame.
        scraper.save_data(rental_data)

        df = scraper.to_dataframe(rental_data)
        df.to_csv("output/rental_data.csv", index=False, encoding='utf-8-sig')

        print("��ƪ��������I")
        print(df.head())
|
utils.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# �� Copilot �ͦ�
|
| 2 |
+
import time
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
def log_message(message: str, level: str = "INFO"):
    """Print *message* prefixed with a timestamp and severity level."""
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{now}] {level}: {message}")
|
| 10 |
+
|
| 11 |
+
def save_json(data, filename: str, output_dir: str = "output"):
    """Save *data* as pretty-printed UTF-8 JSON under *output_dir*.

    Args:
        data: Any JSON-serializable object.
        filename: File name to write inside *output_dir*.
        output_dir: Destination directory (must already exist).

    Returns:
        True on success, False on any failure (logged, not raised).
    """
    try:
        # BUG FIX: the path previously ignored the *filename* argument
        # (it interpolated a hard-coded literal); build it from both parts.
        filepath = f"{output_dir}/{filename}"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        log_message(f"��Ƥw�x�s�� {filepath}")
        return True
    except Exception as e:
        log_message(f"�x�s��Ʈɵo�Ϳ��~: {e}", "ERROR")
        return False
|
| 22 |
+
|
| 23 |
+
def load_json(filename: str, output_dir: str = "output"):
    """Load a UTF-8 JSON file named *filename* from *output_dir*.

    Returns:
        The decoded object, or None on any failure (logged, not raised).
    """
    try:
        # BUG FIX: the path previously ignored the *filename* argument
        # (it interpolated a hard-coded literal); build it from both parts.
        filepath = f"{output_dir}/{filename}"
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        log_message(f"���\���J {filepath}")
        return data
    except Exception as e:
        log_message(f"���J��Ʈɵo�Ϳ��~: {e}", "ERROR")
        return None
|
| 34 |
+
|
| 35 |
+
def format_currency(amount: float) -> str:
    """Render an amount as a whole number; add thousands separators from 10k up."""
    spec = ",.0f" if amount >= 10000 else ".0f"
    return format(amount, spec)
|
| 41 |
+
|
| 42 |
+
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """Divide *numerator* by *denominator*, returning *default* on failure.

    A zero denominator or non-numeric input yields *default* instead of
    raising. The original bare ``except`` is narrowed so genuine bugs
    (e.g. KeyboardInterrupt) are no longer swallowed.
    """
    try:
        if denominator == 0:
            return default
        return numerator / denominator
    except (TypeError, ZeroDivisionError):
        return default
|
| 50 |
+
|
| 51 |
+
def clean_text(text: str) -> str:
|
| 52 |
+
"""�M�z��r���e"""
|
| 53 |
+
if not text:
|
| 54 |
+
return ""
|
| 55 |
+
|
| 56 |
+
# �����h�l�ť�
|
| 57 |
+
text = " ".join(text.split())
|
| 58 |
+
|
| 59 |
+
# �����S���r�š]�O�d����B�^��B�Ʀr�M�`�μ��I�^
|
| 60 |
+
import re
|
| 61 |
+
text = re.sub(r'[^\u4e00-\u9fff\w\s.,!?;:()�]�^�i�j�u�v\-]', '', text)
|
| 62 |
+
|
| 63 |
+
return text.strip()
|
| 64 |
+
|
| 65 |
+
def retry_request(func, max_retries: int = 3, delay: float = 1.0):
    """Invoke *func*, retrying with exponential backoff on failure.

    Re-raises the last exception once all attempts are exhausted.
    """
    wait = delay
    for attempt in range(1, max_retries + 1):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries:
                raise e
            log_message(f"�ШD���ѡA{wait}���᭫��... (���� {attempt}/{max_retries})", "WARNING")
            time.sleep(wait)
            wait *= 2  # exponential backoff
|
| 76 |
+
|
| 77 |
+
def validate_price(price_str: str) -> bool:
    """Return True when *price_str* contains a rent in the 5,000-100,000 range.

    Commas are stripped before matching (so `[\\d,]+` simplifies to `\\d+`
    with identical results). The original bare ``except`` is narrowed to
    the input-type failures it was actually guarding against.
    """
    import re
    try:
        price_match = re.search(r'\d+', price_str.replace(',', ''))
    except (TypeError, AttributeError):
        # Non-string input (e.g. None) is simply invalid.
        return False
    if price_match:
        # Plausible monthly-rent range: 5000 - 100000.
        return 5000 <= int(price_match.group()) <= 100000
    return False
|
| 90 |
+
|
| 91 |
+
def validate_area(area_str: str) -> bool:
    """Return True when *area_str* contains a size in the plausible 10-100 range.

    The original bare ``except`` is narrowed to the non-string-input
    failure it was actually guarding against.
    """
    import re
    try:
        area_match = re.search(r'(\d+(?:\.\d+)?)', area_str)
    except TypeError:
        # Non-string input (e.g. None) is simply invalid.
        return False
    if area_match:
        # Plausible size range: 10 - 100 ping.
        return 10 <= float(area_match.group(1)) <= 100
    return False
|
| 103 |
+
|
| 104 |
+
def create_output_directories():
    """Ensure the standard output directory tree exists.

    Logs only directories that were actually missing. ``exist_ok=True``
    closes the race between the existence check and the creation (the
    original plain ``makedirs`` could raise if another process created
    the directory in between).
    """
    import os
    directories = ['output', 'output/images', 'output/data', 'output/reports']

    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)
            log_message(f"�Ыإؿ�: {directory}")
|
| 113 |
+
|
| 114 |
+
def get_current_timestamp() -> str:
    """Current local time as a compact YYYYMMDD_HHMMSS string."""
    return format(datetime.now(), "%Y%m%d_%H%M%S")
|
| 117 |
+
|
| 118 |
+
def calculate_statistics(data_list):
    """Summary statistics for a numeric sequence.

    Returns count/mean/median/std/min/max/q25/q75 as plain Python
    numbers, or an empty dict for empty input.
    """
    if not data_list:
        return {}

    import numpy as np

    values = np.array(data_list)

    summary = {'count': len(values)}
    for name, fn in (('mean', np.mean), ('median', np.median), ('std', np.std),
                     ('min', np.min), ('max', np.max)):
        summary[name] = float(fn(values))
    summary['q25'] = float(np.percentile(values, 25))
    summary['q75'] = float(np.percentile(values, 75))
    return summary
|
| 137 |
+
|
| 138 |
+
def progress_bar(current: int, total: int, length: int = 50):
    """Render an in-place text progress bar; prints a newline on completion."""
    if total == 0:
        return

    done = int(length * current // total)
    remaining = length - done
    pct = (current / total) * 100
    bar = '�i' * done + '-' * remaining

    print(f'\r�i��: |{bar}| {pct:.1f}% ({current}/{total})', end='', flush=True)

    if current >= total:
        print()  # finish the line once complete
|
visualizer.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# �� Copilot �ͦ�
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import seaborn as sns
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
import plotly.express as px
|
| 7 |
+
import plotly.graph_objects as go
|
| 8 |
+
from plotly.subplots import make_subplots
|
| 9 |
+
import json
|
| 10 |
+
from typing import Dict, List
|
| 11 |
+
|
| 12 |
+
# Configure fonts so CJK axis labels and titles render correctly,
# and keep the minus sign displayable with these fonts.
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei', 'SimHei', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
|
| 15 |
+
|
| 16 |
+
class RentalDataVisualizer:
    """Render static (matplotlib) and interactive (plotly) charts for
    scraped rental data.

    Works from two optional inputs: a listings DataFrame (``df``) and a
    precomputed analysis-results dict (``analysis_results``); each plot
    method degrades gracefully when its input is missing.
    """

    def __init__(self, df: pd.DataFrame = None, analysis_results: Dict = None):
        """Initialize the visualizer.

        Args:
            df: Listings DataFrame (expected columns include 'price',
                and optionally 'area' and 'price_per_ping' — TODO confirm
                against the scraper's output schema).
            analysis_results: Dict of analysis results (keys such as
                'price_distribution', 'basic_stats', 'insights').
        """
        self.df = df
        self.analysis_results = analysis_results
        # Qualitative palette reused across bar/pie charts.
        self.colors = px.colors.qualitative.Set3

    def load_data(self, data_path: str):
        """Load the listings DataFrame from a CSV file (utf-8-sig)."""
        try:
            if data_path.endswith('.csv'):
                self.df = pd.read_csv(data_path, encoding='utf-8-sig')
            else:
                # Only CSV input is supported.
                raise ValueError("�д���CSV�榡������ɮ�")
            print(f"���\���J {len(self.df)} ����ƥΩ��ı��")
        except Exception as e:
            print(f"���J��Ʈɵo�Ϳ��~: {e}")

    def load_analysis_results(self, results_path: str):
        """Load the precomputed analysis-results dict from a JSON file."""
        try:
            with open(results_path, 'r', encoding='utf-8') as f:
                self.analysis_results = json.load(f)
            print("���R���G���J���\")
        except Exception as e:
            print(f"���J���R���G�ɵo�Ϳ��~: {e}")

    def plot_price_distribution(self, save_path: str = "output/price_distribution.png"):
        """Plot a histogram and boxplot of listing prices, saved as PNG."""
        if self.df is None or 'price' not in self.df.columns:
            print("�L�kø�s���������ϡG�ʤָ��")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Left: histogram of prices.
        ax1.hist(self.df['price'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        ax1.set_xlabel('���� (��)')
        ax1.set_ylabel('����ƶq')
        ax1.set_title('�������������')
        ax1.grid(True, alpha=0.3)

        # Right: boxplot of the same prices.
        ax2.boxplot(self.df['price'], vert=True, patch_artist=True,
                    boxprops=dict(facecolor='lightgreen', alpha=0.7))
        ax2.set_ylabel('���� (��)')
        ax2.set_title('���������c�ι�')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"���������Ϥw�x�s: {save_path}")

    def plot_price_ranges(self, save_path: str = "output/price_ranges.png"):
        """Plot bar + pie charts of price-range buckets from analysis results.

        Requires ``analysis_results['price_distribution']`` with 'ranges',
        'counts' and 'percentages' lists.
        """
        if not self.analysis_results or 'price_distribution' not in self.analysis_results:
            print("�L�kø�s�����϶��ϡG�ʤ֤��R���G")
            return

        dist_data = self.analysis_results['price_distribution']

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Left: counts per price bucket.
        bars = ax1.bar(dist_data['ranges'], dist_data['counts'],
                       color=self.colors[:len(dist_data['ranges'])], alpha=0.8)
        ax1.set_xlabel('�����϶�')
        ax1.set_ylabel('����ƶq')
        ax1.set_title('�U�����϶�����ƶq')
        ax1.tick_params(axis='x', rotation=45)

        # Annotate each bar with its count.
        for bar, count in zip(bars, dist_data['counts']):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                     f'{count}', ha='center', va='bottom')

        # Right: share of each bucket as a pie.
        ax2.pie(dist_data['percentages'], labels=dist_data['ranges'], autopct='%1.1f%%',
                colors=self.colors[:len(dist_data['ranges'])], startangle=90)
        ax2.set_title('�����϶���Ҥ���')

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"�����϶��Ϥw�x�s: {save_path}")

    def plot_area_analysis(self, save_path: str = "output/area_analysis.png"):
        """Plot size-vs-price scatter (with linear trend) and a size histogram."""
        if self.df is None or 'area' not in self.df.columns:
            print("�L�kø�s�W�Ƥ��R�ϡG�ʤָ��")
            return

        # Drop rows with missing sizes.
        area_data = self.df['area'].dropna()

        if len(area_data) == 0:
            print("�L�kø�s�W�Ƥ��R�ϡG�S�����Ī��W�Ƹ��")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Left: size vs price scatter, only when both columns exist.
        if 'price' in self.df.columns:
            valid_data = self.df.dropna(subset=['area', 'price'])
            if len(valid_data) > 0:
                ax1.scatter(valid_data['area'], valid_data['price'],
                            alpha=0.6, color='coral', s=50)
                ax1.set_xlabel('�W��')
                ax1.set_ylabel('���� (��)')
                ax1.set_title('�W�ƻP�������Y')
                ax1.grid(True, alpha=0.3)

                # Overlay a degree-1 least-squares trend line.
                z = np.polyfit(valid_data['area'], valid_data['price'], 1)
                p = np.poly1d(z)
                ax1.plot(valid_data['area'], p(valid_data['area']), "r--", alpha=0.8)

        # Right: distribution of sizes.
        ax2.hist(area_data, bins=15, alpha=0.7, color='lightgreen', edgecolor='black')
        ax2.set_xlabel('�W��')
        ax2.set_ylabel('����ƶq')
        ax2.set_title('�W�Ƥ���')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"�W�Ƥ��R�Ϥw�x�s: {save_path}")

    def plot_price_per_ping(self, save_path: str = "output/price_per_ping.png"):
        """Plot a histogram and boxplot of per-unit-size prices."""
        if self.df is None or 'price_per_ping' not in self.df.columns:
            print("�L�kø�s�C�W�����ϡG�ʤָ��")
            return

        price_per_ping_data = self.df['price_per_ping'].dropna()

        if len(price_per_ping_data) == 0:
            print("�L�kø�s�C�W�����ϡG�S�����Ī��C�W�������")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Left: histogram of per-ping prices.
        ax1.hist(price_per_ping_data, bins=20, alpha=0.7, color='gold', edgecolor='black')
        ax1.set_xlabel('�C�W���� (��/�W)')
        ax1.set_ylabel('����ƶq')
        ax1.set_title('�C�W��������')
        ax1.grid(True, alpha=0.3)

        # Right: boxplot of the same values.
        ax2.boxplot(price_per_ping_data, vert=True, patch_artist=True,
                    boxprops=dict(facecolor='orange', alpha=0.7))
        ax2.set_ylabel('�C�W���� (��/�W)')
        ax2.set_title('�C�W�����c�ι�')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"�C�W�����Ϥw�x�s: {save_path}")

    def plot_keywords_analysis(self, save_path: str = "output/keywords_analysis.png"):
        """Plot a horizontal bar chart of keyword frequencies from the
        description analysis (zero-frequency keywords are dropped)."""
        if not self.analysis_results or 'description_analysis' not in self.analysis_results:
            print("�L�kø�s����r���R�ϡG�ʤ֤��R���G")
            return

        desc_analysis = self.analysis_results['description_analysis']
        if 'keywords_frequency' not in desc_analysis:
            print("�L�kø�s����r���R�ϡG�ʤ�����r���")
            return

        keywords_data = desc_analysis['keywords_frequency']

        # Keep only keywords that actually occurred.
        filtered_keywords = {k: v for k, v in keywords_data.items() if v > 0}

        if not filtered_keywords:
            print("�S������������r���")
            return

        keywords = list(filtered_keywords.keys())
        frequencies = list(filtered_keywords.values())

        plt.figure(figsize=(12, 8))
        bars = plt.barh(keywords, frequencies, color=self.colors[:len(keywords)])
        plt.xlabel('�X�{����')
        plt.ylabel('����r')
        plt.title('����y�z����r�W�v���R')
        plt.grid(True, alpha=0.3, axis='x')

        # Annotate each bar with its frequency.
        for bar, freq in zip(bars, frequencies):
            width = bar.get_width()
            plt.text(width + 0.1, bar.get_y() + bar.get_height()/2.,
                     f'{freq}', ha='left', va='center')

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"����r���R�Ϥw�x�s: {save_path}")

    def create_interactive_dashboard(self, save_path: str = "output/dashboard.html"):
        """Build a 2x2 plotly dashboard and write it as a standalone HTML file."""
        if self.df is None:
            print("�L�k�Ыػ����O�G�ʤָ��")
            return

        # 2x2 grid: price histogram / size-vs-price / price buckets / per-ping.
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('��������', '�W��vs����', '�����϶�����', '�C�W��������'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"type": "bar"}, {"secondary_y": False}]]
        )

        # 1. Price histogram.
        fig.add_trace(
            go.Histogram(x=self.df['price'], name='��������', nbinsx=20,
                         marker_color='skyblue', opacity=0.7),
            row=1, col=1
        )

        # 2. Size vs price scatter (only when both columns have data).
        if 'area' in self.df.columns:
            valid_data = self.df.dropna(subset=['area', 'price'])
            if len(valid_data) > 0:
                fig.add_trace(
                    go.Scatter(x=valid_data['area'], y=valid_data['price'],
                               mode='markers', name='�W��vs����',
                               marker=dict(color='coral', size=8, opacity=0.6)),
                    row=1, col=2
                )

        # 3. Price-bucket bar chart from precomputed analysis results.
        if self.analysis_results and 'price_distribution' in self.analysis_results:
            dist_data = self.analysis_results['price_distribution']
            fig.add_trace(
                go.Bar(x=dist_data['ranges'], y=dist_data['counts'],
                       name='�����϶�', marker_color='lightgreen'),
                row=2, col=1
            )

        # 4. Per-ping price histogram.
        if 'price_per_ping' in self.df.columns:
            price_per_ping_data = self.df['price_per_ping'].dropna()
            if len(price_per_ping_data) > 0:
                fig.add_trace(
                    go.Histogram(x=price_per_ping_data, name='�C�W����', nbinsx=15,
                                 marker_color='gold', opacity=0.7),
                    row=2, col=2
                )

        # Overall layout.
        fig.update_layout(
            title_text="���������s�ϯ��Υ������R�����O",
            title_x=0.5,
            height=800,
            showlegend=False
        )

        # Per-subplot axis titles.
        fig.update_xaxes(title_text="���� (��)", row=1, col=1)
        fig.update_yaxes(title_text="����ƶq", row=1, col=1)
        fig.update_xaxes(title_text="�W��", row=1, col=2)
        fig.update_yaxes(title_text="���� (��)", row=1, col=2)
        fig.update_xaxes(title_text="�����϶�", row=2, col=1)
        fig.update_yaxes(title_text="����ƶq", row=2, col=1)
        fig.update_xaxes(title_text="�C�W���� (��/�W)", row=2, col=2)
        fig.update_yaxes(title_text="����ƶq", row=2, col=2)

        # Write the interactive figure to disk.
        fig.write_html(save_path)
        print(f"���ʦ������O�w�x�s: {save_path}")

    def generate_all_visualizations(self):
        """Generate every static chart plus the interactive dashboard."""
        print("�}�l�ͦ���ı�ƹϪ�...")

        # Static charts.
        self.plot_price_distribution()
        self.plot_price_ranges()
        self.plot_area_analysis()
        self.plot_price_per_ping()
        self.plot_keywords_analysis()

        # Interactive dashboard.
        self.create_interactive_dashboard()

        print("�Ҧ���ı�ƹϪ��ͦ������I")

    def create_summary_report(self, save_path: str = "output/summary_report.png"):
        """Render a text-only summary report of the analysis as a PNG image.

        Requires ``analysis_results['basic_stats']`` with a 'price_stats'
        dict; 'area_stats', 'price_per_ping_stats' and top-level 'insights'
        are appended when present.
        """
        if not self.analysis_results or 'basic_stats' not in self.analysis_results:
            print("�L�k�ЫغK�n���i�G�ʤ֤��R���G")
            return

        fig, ax = plt.subplots(figsize=(12, 8))
        ax.axis('off')

        # Report title.
        fig.suptitle('���������s�ϯ��Υ������R�K�n���i', fontsize=20, fontweight='bold', y=0.95)

        stats = self.analysis_results['basic_stats']

        # Core price statistics block.
        report_text = f"""

? �������p
? �`�����: {stats['total_properties']} ��
? ��ƽd��: 2�СB��h�B�q��j��

? �����έp
? ��������: {stats['price_stats']['mean']:,} ��
? ����Ư���: {stats['price_stats']['median']:,} ��
? �̧C����: {stats['price_stats']['min']:,} ��
? �̰�����: {stats['price_stats']['max']:,} ��
? �зǮt: {stats['price_stats']['std']:,} ��

? �����S�x
? �Ĥ@�|�����: {stats['price_stats']['q25']:,} ��
? �ĤT�|�����: {stats['price_stats']['q75']:,} ��
"""

        # Append size statistics when available.
        if 'area_stats' in stats and stats['area_stats']:
            area_stats = stats['area_stats']
            report_text += f"""
? �W�Ʋέp
? �����W��: {area_stats['mean']} �W
? ����ƩW��: {area_stats['median']} �W
? �̤p�W��: {area_stats['min']} �W
? �̤j�W��: {area_stats['max']} �W
"""

        # Append per-ping price statistics when available.
        if 'price_per_ping_stats' in stats and stats['price_per_ping_stats']:
            pp_stats = stats['price_per_ping_stats']
            report_text += f"""
? �C�W�����έp
? �����C�W����: {pp_stats['mean']:,} ��/�W
? ����ƨC�W����: {pp_stats['median']:,} ��/�W
? �̧C�C�W����: {pp_stats['min']:,} ��/�W
? �̰��C�W����: {pp_stats['max']:,} ��/�W
"""

        # Append key insights when available.
        if 'insights' in self.analysis_results:
            report_text += "\n\n? ���n�}��\n"
            for i, insight in enumerate(self.analysis_results['insights'], 1):
                report_text += f"? {insight}\n"

        # Draw the assembled text inside a rounded box.
        ax.text(0.05, 0.95, report_text, transform=ax.transAxes, fontsize=12,
                verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8))

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"�K�n���i�w�x�s: {save_path}")
|
| 389 |
+
|
| 390 |
+
if __name__ == "__main__":
    # Smoke-test the visualizer against previously exported artifacts.
    visualizer = RentalDataVisualizer()

    # Pull in the scraped CSV and the precomputed analysis JSON.
    visualizer.load_data("output/rental_data.csv")
    visualizer.load_analysis_results("output/analysis_results.json")

    # Produce every chart, then the one-page summary.
    visualizer.generate_all_visualizations()
    visualizer.create_summary_report()
|