54justin committed on
Commit
f205f47
·
verified ·
1 Parent(s): a0f0229

Upload 7 files

Browse files
Files changed (7) hide show
  1. 591_rental_analysis.ipynb +10 -0
  2. analyzer.py +373 -0
  3. main.py +179 -0
  4. requirements.txt +14 -0
  5. scraper.py +253 -0
  6. utils.py +150 -0
  7. visualizer.py +402 -0
591_rental_analysis.ipynb ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {
4
+ "language_info": {
5
+ "name": "python"
6
+ }
7
+ },
8
+ "nbformat": 4,
9
+ "nbformat_minor": 5
10
+ }
analyzer.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 由 Copilot 生成 (generated by Copilot)
2
+ import pandas as pd
3
+ import numpy as np
4
+ from typing import Dict, List, Tuple
5
+ import json
6
+ from transformers import pipeline, AutoTokenizer, AutoModel
7
+ from datasets import Dataset
8
+ import re
9
+
10
+ class RentalDataAnalyzer:
11
+ """���θ�Ƥ��R��"""
12
+
13
+ def __init__(self, data_path: str = None):
14
+ """
15
+ ��l�Ƥ��R��
16
+
17
+ Args:
18
+ data_path: ����ɮ׸��|
19
+ """
20
+ self.data_path = data_path
21
+ self.df = None
22
+ self.analysis_results = {}
23
+
24
+ # ��l��Hugging Face�ҫ��Ω��r���R
25
+ self.sentiment_analyzer = None
26
+ self.text_classifier = None
27
+
28
+ def load_data(self, data_path: str = None) -> pd.DataFrame:
29
+ """���J���"""
30
+ if data_path:
31
+ self.data_path = data_path
32
+
33
+ try:
34
+ if self.data_path.endswith('.json'):
35
+ with open(self.data_path, 'r', encoding='utf-8') as f:
36
+ data = json.load(f)
37
+ self.df = pd.DataFrame(data)
38
+ elif self.data_path.endswith('.csv'):
39
+ self.df = pd.read_csv(self.data_path, encoding='utf-8-sig')
40
+ else:
41
+ raise ValueError("���䴩���ɮ׮榡")
42
+
43
+ print(f"���\���J {len(self.df)} �����")
44
+ return self.df
45
+
46
+ except Exception as e:
47
+ print(f"���J��Ʈɵo�Ϳ��~: {e}")
48
+ return None
49
+
50
+ def clean_data(self) -> pd.DataFrame:
51
+ """�M�~���"""
52
+ if self.df is None:
53
+ print("�����J���")
54
+ return None
55
+
56
+ print("�}�l�M�~���...")
57
+
58
+ # �������Ƹ��
59
+ original_count = len(self.df)
60
+ self.df = self.df.drop_duplicates(subset=['title', 'address', 'price'])
61
+ print(f"���� {original_count - len(self.df)} �����Ƹ��")
62
+
63
+ # �M�z�������
64
+ self.df['price'] = pd.to_numeric(self.df['price'], errors='coerce')
65
+ self.df = self.df[self.df['price'] > 0] # �����L���
66
+
67
+ # �M�z�W�Ƹ��
68
+ self.df['area'] = pd.to_numeric(self.df['area'], errors='coerce')
69
+
70
+ # �p��C�W����
71
+ self.df['price_per_ping'] = self.df.apply(
72
+ lambda row: row['price'] / row['area'] if row['area'] > 0 else np.nan,
73
+ axis=1
74
+ )
75
+
76
+ # �������`�ȡ]�ϥ�IQR��k�^
77
+ self.df = self.remove_outliers(self.df, 'price')
78
+
79
+ print(f"�M�~��Ѿl {len(self.df)} �����ĸ��")
80
+ return self.df
81
+
82
+ def remove_outliers(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
83
+ """�������`��"""
84
+ Q1 = df[column].quantile(0.25)
85
+ Q3 = df[column].quantile(0.75)
86
+ IQR = Q3 - Q1
87
+
88
+ lower_bound = Q1 - 1.5 * IQR
89
+ upper_bound = Q3 + 1.5 * IQR
90
+
91
+ outliers_count = len(df[(df[column] < lower_bound) | (df[column] > upper_bound)])
92
+ print(f"���� {outliers_count} �� {column} ���`��")
93
+
94
+ return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
95
+
96
+ def basic_statistics(self) -> Dict:
97
+ """�򥻲έp���R"""
98
+ if self.df is None or len(self.df) == 0:
99
+ return {}
100
+
101
+ stats = {
102
+ 'total_properties': len(self.df),
103
+ 'price_stats': {
104
+ 'mean': round(self.df['price'].mean(), 2),
105
+ 'median': round(self.df['price'].median(), 2),
106
+ 'std': round(self.df['price'].std(), 2),
107
+ 'min': self.df['price'].min(),
108
+ 'max': self.df['price'].max(),
109
+ 'q25': round(self.df['price'].quantile(0.25), 2),
110
+ 'q75': round(self.df['price'].quantile(0.75), 2)
111
+ },
112
+ 'area_stats': {
113
+ 'mean': round(self.df['area'].mean(), 2),
114
+ 'median': round(self.df['area'].median(), 2),
115
+ 'min': self.df['area'].min(),
116
+ 'max': self.df['area'].max()
117
+ } if not self.df['area'].isna().all() else {},
118
+ 'price_per_ping_stats': {
119
+ 'mean': round(self.df['price_per_ping'].mean(), 2),
120
+ 'median': round(self.df['price_per_ping'].median(), 2),
121
+ 'min': round(self.df['price_per_ping'].min(), 2),
122
+ 'max': round(self.df['price_per_ping'].max(), 2)
123
+ } if not self.df['price_per_ping'].isna().all() else {}
124
+ }
125
+
126
+ self.analysis_results['basic_stats'] = stats
127
+ return stats
128
+
129
+ def price_distribution_analysis(self) -> Dict:
130
+ """�����������R"""
131
+ if self.df is None or len(self.df) == 0:
132
+ return {}
133
+
134
+ # �w�q�����϶�
135
+ price_bins = [0, 15000, 20000, 25000, 30000, 40000, float('inf')]
136
+ price_labels = ['<15K', '15-20K', '20-25K', '25-30K', '30-40K', '>40K']
137
+
138
+ self.df['price_range'] = pd.cut(self.df['price'], bins=price_bins, labels=price_labels, right=False)
139
+
140
+ distribution = self.df['price_range'].value_counts().sort_index()
141
+
142
+ distribution_dict = {
143
+ 'ranges': distribution.index.tolist(),
144
+ 'counts': distribution.values.tolist(),
145
+ 'percentages': (distribution / len(self.df) * 100).round(2).tolist()
146
+ }
147
+
148
+ self.analysis_results['price_distribution'] = distribution_dict
149
+ return distribution_dict
150
+
151
+ def area_analysis(self) -> Dict:
152
+ """�W�Ƥ��R"""
153
+ if self.df is None or len(self.df) == 0 or self.df['area'].isna().all():
154
+ return {}
155
+
156
+ # �w�q�W�ư϶�
157
+ area_bins = [0, 20, 30, 40, 50, float('inf')]
158
+ area_labels = ['<20�W', '20-30�W', '30-40�W', '40-50�W', '>50�W']
159
+
160
+ self.df['area_range'] = pd.cut(self.df['area'], bins=area_bins, labels=area_labels, right=False)
161
+
162
+ area_distribution = self.df['area_range'].value_counts().sort_index()
163
+
164
+ area_dict = {
165
+ 'ranges': area_distribution.index.tolist(),
166
+ 'counts': area_distribution.values.tolist(),
167
+ 'percentages': (area_distribution / len(self.df) * 100).round(2).tolist()
168
+ }
169
+
170
+ self.analysis_results['area_analysis'] = area_dict
171
+ return area_dict
172
+
173
+ def setup_huggingface_models(self):
174
+ """�]�mHugging Face�ҫ�"""
175
+ try:
176
+ print("���JHugging Face�ҫ�...")
177
+
178
+ # ���J���屡�P���R�ҫ�
179
+ self.sentiment_analyzer = pipeline(
180
+ "sentiment-analysis",
181
+ model="ckiplab/bert-base-chinese-ws",
182
+ return_all_scores=True
183
+ )
184
+
185
+ print("Hugging Face�ҫ����J����")
186
+ except Exception as e:
187
+ print(f"���JHugging Face�ҫ��ɵo�Ϳ��~: {e}")
188
+
189
+ def analyze_descriptions(self) -> Dict:
190
+ """���R����y�z��r"""
191
+ if self.df is None or 'raw_info' not in self.df.columns:
192
+ return {}
193
+
194
+ descriptions = self.df['raw_info'].dropna().tolist()
195
+
196
+ if not descriptions:
197
+ return {}
198
+
199
+ # ����r���R
200
+ keywords_analysis = self.analyze_keywords(descriptions)
201
+
202
+ analysis_result = {
203
+ 'keywords_frequency': keywords_analysis,
204
+ 'total_descriptions': len(descriptions)
205
+ }
206
+
207
+ self.analysis_results['description_analysis'] = analysis_result
208
+ return analysis_result
209
+
210
+ def analyze_keywords(self, descriptions: List[str]) -> Dict:
211
+ """���R����r�W�v"""
212
+ # �w�q�Ыά�������r
213
+ keywords = [
214
+ '�񱶹B', '�񨮯�', '�q��', '���x', '������', '�޲z�O',
215
+ '�ĥ�', '�q��', '�w�R', '�K�Q', '�ͬ�����', '�ǰ�',
216
+ '���s', '���C', '�a��', '�a�q', '�N��', '�~���'
217
+ ]
218
+
219
+ keyword_counts = {keyword: 0 for keyword in keywords}
220
+
221
+ for desc in descriptions:
222
+ for keyword in keywords:
223
+ if keyword in desc:
224
+ keyword_counts[keyword] += 1
225
+
226
+ # �ƧǨè��e10��
227
+ sorted_keywords = dict(sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10])
228
+
229
+ return sorted_keywords
230
+
231
+ def correlation_analysis(self) -> Dict:
232
+ """�����ʤ��R"""
233
+ if self.df is None or len(self.df) == 0:
234
+ return {}
235
+
236
+ numeric_columns = ['price', 'area', 'price_per_ping']
237
+ available_columns = [col for col in numeric_columns if col in self.df.columns and not self.df[col].isna().all()]
238
+
239
+ if len(available_columns) < 2:
240
+ return {}
241
+
242
+ correlation_matrix = self.df[available_columns].corr()
243
+
244
+ correlation_dict = {}
245
+ for i, col1 in enumerate(available_columns):
246
+ for j, col2 in enumerate(available_columns):
247
+ if i < j: # �קK����
248
+ correlation_dict[f"{col1}_vs_{col2}"] = round(correlation_matrix.loc[col1, col2], 3)
249
+
250
+ self.analysis_results['correlation'] = correlation_dict
251
+ return correlation_dict
252
+
253
+ def generate_insights(self) -> List[str]:
254
+ """�ͦ����R�}��"""
255
+ insights = []
256
+
257
+ if 'basic_stats' in self.analysis_results:
258
+ stats = self.analysis_results['basic_stats']
259
+ insights.append(f"�@��� {stats['total_properties']} ���ŦX���󪺯��Ϊ���")
260
+ insights.append(f"���������� {stats['price_stats']['mean']:,} ��")
261
+ insights.append(f"��������Ƭ� {stats['price_stats']['median']:,} ��")
262
+
263
+ if stats['price_stats']['mean'] > stats['price_stats']['median']:
264
+ insights.append("���������V�k���סA�s�b����������԰�������")
265
+
266
+ if 'price_distribution' in self.analysis_results:
267
+ dist = self.analysis_results['price_distribution']
268
+ max_range_idx = dist['percentages'].index(max(dist['percentages']))
269
+ most_common_range = dist['ranges'][max_range_idx]
270
+ percentage = dist['percentages'][max_range_idx]
271
+ insights.append(f"�̱`���������϶��O {most_common_range}�A�� {percentage}%")
272
+
273
+ if 'area_analysis' in self.analysis_results:
274
+ area = self.analysis_results['area_analysis']
275
+ if area:
276
+ max_area_idx = area['percentages'].index(max(area['percentages']))
277
+ most_common_area = area['ranges'][max_area_idx]
278
+ insights.append(f"�̱`�����W�ư϶��O {most_common_area}")
279
+
280
+ return insights
281
+
282
+ def run_full_analysis(self) -> Dict:
283
+ """���槹����R"""
284
+ print("�}�l���槹����R...")
285
+
286
+ # �򥻲έp
287
+ basic_stats = self.basic_statistics()
288
+ print("? �򥻲έp���R����")
289
+
290
+ # �����������R
291
+ price_dist = self.price_distribution_analysis()
292
+ print("? �����������R����")
293
+
294
+ # �W�Ƥ��R
295
+ area_analysis = self.area_analysis()
296
+ print("? �W�Ƥ��R����")
297
+
298
+ # �y�z��r���R
299
+ desc_analysis = self.analyze_descriptions()
300
+ print("? �y�z��r���R����")
301
+
302
+ # �����ʤ��R
303
+ correlation = self.correlation_analysis()
304
+ print("? �����ʤ��R����")
305
+
306
+ # �ͦ��}��
307
+ insights = self.generate_insights()
308
+ print("? �}��ͦ�����")
309
+
310
+ self.analysis_results['insights'] = insights
311
+
312
+ return self.analysis_results
313
+
314
+ def save_analysis_results(self, filename: str = "analysis_results.json"):
315
+ """�x�s���R���G"""
316
+ try:
317
+ with open(f"output/{filename}", 'w', encoding='utf-8') as f:
318
+ json.dump(self.analysis_results, f, ensure_ascii=False, indent=2)
319
+ print(f"���R���G�w�x�s�� output/{filename}")
320
+ except Exception as e:
321
+ print(f"�x�s���R���G�ɵo�Ϳ��~: {e}")
322
+
323
+ def print_summary(self):
324
+ """�L�X���R�K�n"""
325
+ if not self.analysis_results:
326
+ print("�S�����R���G�i���")
327
+ return
328
+
329
+ print("\n" + "="*50)
330
+ print("���������s�ϯ��Υ������R���i")
331
+ print("="*50)
332
+
333
+ if 'insights' in self.analysis_results:
334
+ print("\n? ���n�}��:")
335
+ for i, insight in enumerate(self.analysis_results['insights'], 1):
336
+ print(f"{i}. {insight}")
337
+
338
+ if 'basic_stats' in self.analysis_results:
339
+ stats = self.analysis_results['basic_stats']
340
+ print(f"\n? �����έp:")
341
+ print(f" ��������: {stats['price_stats']['mean']:,} ��")
342
+ print(f" �����: {stats['price_stats']['median']:,} ��")
343
+ print(f" �̧C����: {stats['price_stats']['min']:,} ��")
344
+ print(f" �̰�����: {stats['price_stats']['max']:,} ��")
345
+ print(f" �зǮt: {stats['price_stats']['std']:,} ��")
346
+
347
+ if 'price_distribution' in self.analysis_results:
348
+ print(f"\n? ��������:")
349
+ dist = self.analysis_results['price_distribution']
350
+ for range_name, count, percentage in zip(dist['ranges'], dist['counts'], dist['percentages']):
351
+ print(f" {range_name}: {count} �� ({percentage}%)")
352
+
353
+ print("\n" + "="*50)
354
+
355
if __name__ == "__main__":
    # Manual smoke run: load the scraped CSV, then clean, analyse, save and
    # print.  Nothing happens when loading fails.
    analyzer = RentalDataAnalyzer()

    if analyzer.load_data("output/rental_data.csv") is not None:
        analyzer.clean_data()
        analyzer.run_full_analysis()
        analyzer.save_analysis_results()
        analyzer.print_summary()
main.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 由 Copilot 生成 (generated by Copilot)
2
+ """
3
+ 591���θ�Ƥ��R�� - �D�{��
4
+ ���������s�ϯ��Υ������R�u��
5
+
6
+ ���{����X�F�������ΡB��Ƥ��R�M��ı�ƥ\��A
7
+ �M���Ω���R591���κ������θ�ơC
8
+ """
9
+
10
+ import os
11
+ import sys
12
+ import argparse
13
+ from datetime import datetime
14
+
15
+ # �[�J�۹���|
16
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
17
+
18
+ from scraper import Rent591Scraper
19
+ from analyzer import RentalDataAnalyzer
20
+ from visualizer import RentalDataVisualizer
21
+ from utils import log_message, create_output_directories, get_current_timestamp
22
+
23
+ class RentalAnalysisApp:
24
+ """591���Τ��R���ε{���D���O"""
25
+
26
+ def __init__(self):
27
+ self.scraper = Rent591Scraper()
28
+ self.analyzer = RentalDataAnalyzer()
29
+ self.visualizer = RentalDataVisualizer()
30
+ self.timestamp = get_current_timestamp()
31
+
32
+ def run_full_pipeline(self, max_pages: int = 5, skip_scraping: bool = False):
33
+ """���槹�㪺���R�y�{"""
34
+ print("? 591���θ�Ƥ��R���Ұ�")
35
+ print("=" * 50)
36
+
37
+ # �Ыؿ�X�ؿ�
38
+ create_output_directories()
39
+
40
+ # �B�J1: ��ƪ���
41
+ if not skip_scraping:
42
+ log_message("�}�l����591����...")
43
+ rental_data = self.scraper.scrape_rental_data(max_pages=max_pages)
44
+
45
+ if not rental_data:
46
+ log_message("������������ơA�{���פ�", "ERROR")
47
+ return False
48
+
49
+ log_message(f"���\���� {len(rental_data)} �����")
50
+
51
+ # �x�s��l���
52
+ self.scraper.save_data(rental_data, f"raw_data_{self.timestamp}.json")
53
+
54
+ # �ഫ��CSV
55
+ df = self.scraper.to_dataframe(rental_data)
56
+ csv_filename = f"output/rental_data_{self.timestamp}.csv"
57
+ df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
58
+ log_message(f"��Ƥw�x�s��CSV: {csv_filename}")
59
+
60
+ # �ϥγ̷s������ɮ�
61
+ data_file = csv_filename
62
+ else:
63
+ # �M��̷s������ɮ�
64
+ data_files = [f for f in os.listdir("output") if f.startswith("rental_data") and f.endswith(".csv")]
65
+ if not data_files:
66
+ log_message("�䤣��{������ɮסA�Х����檦��", "ERROR")
67
+ return False
68
+ data_file = f"output/{sorted(data_files)[-1]}"
69
+ log_message(f"�ϥβ{������ɮ�: {data_file}")
70
+
71
+ # �B�J2: ��Ƥ��R
72
+ log_message("�}�l��Ƥ��R...")
73
+
74
+ # ���J���
75
+ self.analyzer.load_data(data_file)
76
+
77
+ # �M�~���
78
+ cleaned_df = self.analyzer.clean_data()
79
+ if cleaned_df is None or len(cleaned_df) == 0:
80
+ log_message("��ƲM�~��S�����ĸ��", "ERROR")
81
+ return False
82
+
83
+ # ���槹����R
84
+ analysis_results = self.analyzer.run_full_analysis()
85
+
86
+ # �x�s���R���G
87
+ results_filename = f"analysis_results_{self.timestamp}.json"
88
+ self.analyzer.save_analysis_results(results_filename)
89
+
90
+ # ��ܤ��R�K�n
91
+ self.analyzer.print_summary()
92
+
93
+ # �B�J3: ��Ƶ�ı��
94
+ log_message("�}�l�ͦ���ı�ƹϪ�...")
95
+
96
+ # �]�m��ı�ƾ�
97
+ self.visualizer.df = cleaned_df
98
+ self.visualizer.analysis_results = analysis_results
99
+
100
+ # �ͦ��Ҧ��Ϫ�
101
+ self.visualizer.generate_all_visualizations()
102
+
103
+ # �ЫغK�n���i
104
+ summary_filename = f"output/summary_report_{self.timestamp}.png"
105
+ self.visualizer.create_summary_report(summary_filename)
106
+
107
+ log_message("���R�����I", "SUCCESS")
108
+ self.print_completion_summary()
109
+
110
+ return True
111
+
112
+ def print_completion_summary(self):
113
+ """�L�X�����K�n"""
114
+ print("\n" + "? ���R�����I" + "?")
115
+ print("=" * 50)
116
+ print("? ��X�ɮ�:")
117
+ print(f" �u�w�w ��l���: output/raw_data_{self.timestamp}.json")
118
+ print(f" �u�w�w �M�~���: output/rental_data_{self.timestamp}.csv")
119
+ print(f" �u�w�w ���R���G: output/analysis_results_{self.timestamp}.json")
120
+ print(f" �u�w�w �K�n���i: output/summary_report_{self.timestamp}.png")
121
+ print(" �u�w�w �Ϫ��ɮ�:")
122
+ print(" �x �u�w�w output/price_distribution.png")
123
+ print(" �x �u�w�w output/price_ranges.png")
124
+ print(" �x �u�w�w output/area_analysis.png")
125
+ print(" �x �u�w�w output/price_per_ping.png")
126
+ print(" �x �|�w�w output/keywords_analysis.png")
127
+ print(" �|�w�w ���ʦ������O: output/dashboard.html")
128
+ print("\n? ����: ���} dashboard.html �i�d�ݤ��ʦ����R���G")
129
+ print("=" * 50)
130
+
131
def main():
    """Parse the command line and run the analysis pipeline.

    Returns:
        ``0`` when the pipeline succeeded, ``1`` on failure, on Ctrl-C or on
        any unexpected exception (which is logged).
    """
    parser = argparse.ArgumentParser(description='591���θ�Ƥ��R��')
    parser.add_argument('--max-pages', type=int, default=5,
                        help='�̤j�������� (�w�]: 5)')
    parser.add_argument('--skip-scraping', action='store_true',
                        help='���L���ΡA�ϥβ{����ƶi����R')
    parser.add_argument('--analysis-only', action='store_true',
                        help='�Ȱ�����R�A�����s�������')

    args = parser.parse_args()

    try:
        app = RentalAnalysisApp()

        if args.analysis_only:
            # Analysis-only mode: never scrape, reuse existing data.
            log_message("����Ȥ��R�Ҧ�...")
            success = app.run_full_pipeline(max_pages=0, skip_scraping=True)
        else:
            success = app.run_full_pipeline(
                max_pages=args.max_pages,
                skip_scraping=args.skip_scraping
            )

        if success:
            log_message("�{�����榨�\�����I", "SUCCESS")
            return 0
        else:
            log_message("�{�����楢��", "ERROR")
            return 1

    except KeyboardInterrupt:
        log_message("�ϥΪ̤��_�{������", "WARNING")
        return 1
    except Exception as e:
        # The literal "{" in this message is doubled so that it is not parsed
        # as the start of an f-string replacement field.
        log_message(f"�{{������ɵo�ͥ��w�����~: {e}", "ERROR")
        return 1
170
+
171
if __name__ == "__main__":
    # Banner shown before the pipeline starts.
    print("? 591���θ�Ƥ��R��")
    print("? �ؼаϰ�: ���������s��")
    print("? ��������: 2�СB��h�B�q��j��")
    print("? ��X Hugging Face �ͺA�t��")
    print("-" * 50)

    # Hand main()'s status to the interpreter so that a failed run ends with
    # a non-zero exit code instead of always exiting 0.
    exit_code = main()
    sys.exit(exit_code)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 由 Copilot 生成 (generated by Copilot)
2
+ requests>=2.31.0
3
+ beautifulsoup4>=4.12.0
4
+ pandas>=2.0.0
5
+ numpy>=1.24.0
6
+ matplotlib>=3.7.0
7
+ seaborn>=0.12.0
8
+ transformers>=4.30.0
9
+ datasets>=2.14.0
10
+ plotly>=5.15.0
11
+ jupyter>=1.0.0
12
+ lxml>=4.9.0
13
+ selenium>=4.10.0
14
+ webdriver-manager>=3.8.0
scraper.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 由 Copilot 生成 (generated by Copilot)
2
+ import requests
3
+ import time
4
+ import json
5
+ import pandas as pd
6
+ from bs4 import BeautifulSoup
7
+ from selenium import webdriver
8
+ from selenium.webdriver.common.by import By
9
+ from selenium.webdriver.chrome.service import Service
10
+ from selenium.webdriver.chrome.options import Options
11
+ from webdriver_manager.chrome import ChromeDriverManager
12
+ import re
13
+ from typing import List, Dict, Optional
14
+
15
+ class Rent591Scraper:
16
+ """591����������O"""
17
+
18
+ def __init__(self):
19
+ self.base_url = "https://rent.591.com.tw"
20
+ self.headers = {
21
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
22
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
23
+ 'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
24
+ 'Accept-Encoding': 'gzip, deflate, br',
25
+ 'Connection': 'keep-alive',
26
+ 'Upgrade-Insecure-Requests': '1',
27
+ }
28
+ self.session = requests.Session()
29
+ self.session.headers.update(self.headers)
30
+
31
+ def setup_driver(self):
32
+ """�]�mChrome WebDriver"""
33
+ chrome_options = Options()
34
+ chrome_options.add_argument('--headless') # �L�Y�Ҧ�
35
+ chrome_options.add_argument('--no-sandbox')
36
+ chrome_options.add_argument('--disable-dev-shm-usage')
37
+ chrome_options.add_argument('--disable-gpu')
38
+ chrome_options.add_argument('--window-size=1920,1080')
39
+ chrome_options.add_argument(f'--user-agent={self.headers["User-Agent"]}')
40
+
41
+ service = Service(ChromeDriverManager().install())
42
+ driver = webdriver.Chrome(service=service, options=chrome_options)
43
+ return driver
44
+
45
+ def get_csrf_token(self, driver):
46
+ """���CSRF Token"""
47
+ try:
48
+ # �X�ݭ������token
49
+ driver.get("https://rent.591.com.tw/")
50
+ time.sleep(2)
51
+
52
+ # ���ձq����������token
53
+ token_element = driver.find_element(By.NAME, "csrf-token")
54
+ if token_element:
55
+ return token_element.get_attribute("content")
56
+
57
+ # �p�G�S���A���ձqcookies�����
58
+ cookies = driver.get_cookies()
59
+ for cookie in cookies:
60
+ if 'token' in cookie['name'].lower():
61
+ return cookie['value']
62
+
63
+ except Exception as e:
64
+ print(f"���token����: {e}")
65
+
66
+ return None
67
+
68
+ def scrape_rental_data(self, max_pages: int = 10) -> List[Dict]:
69
+ """
70
+ ��������
71
+
72
+ Args:
73
+ max_pages: �̤j��������
74
+
75
+ Returns:
76
+ ���θ�ƦC��
77
+ """
78
+ driver = self.setup_driver()
79
+ all_data = []
80
+
81
+ try:
82
+ # �ؼ�URL�Ѽ�
83
+ params = {
84
+ 'region': '17', # ������
85
+ 'section': '247', # ���s��
86
+ 'kind': '1', # ��h���a
87
+ 'layout': '2', # 2��
88
+ 'shape': '2' # �q��j��
89
+ }
90
+
91
+ for page in range(1, max_pages + 1):
92
+ print(f"���b������ {page} ��...")
93
+
94
+ # �c��URL
95
+ params['page'] = page
96
+ url = f"{self.base_url}/list?" + "&".join([f"{k}={v}" for k, v in params.items()])
97
+
98
+ driver.get(url)
99
+ time.sleep(3) # ���ݭ������J
100
+
101
+ # �ˬd�O�_�����
102
+ rental_items = driver.find_elements(By.CSS_SELECTOR, '.rent-item')
103
+ if not rental_items:
104
+ print(f"�� {page} ���S������ơA�����")
105
+ break
106
+
107
+ page_data = self.parse_page_data(driver)
108
+ all_data.extend(page_data)
109
+
110
+ print(f"�� {page} ����� {len(page_data)} �����")
111
+
112
+ # �קK�Q��IP�A�[�J����
113
+ time.sleep(2)
114
+
115
+ except Exception as e:
116
+ print(f"������Ʈɵo�Ϳ��~: {e}")
117
+ finally:
118
+ driver.quit()
119
+
120
+ return all_data
121
+
122
+ def parse_page_data(self, driver) -> List[Dict]:
123
+ """�ѪR�歶���"""
124
+ page_data = []
125
+
126
+ try:
127
+ # �������HTML
128
+ soup = BeautifulSoup(driver.page_source, 'html.parser')
129
+
130
+ # �d�䯲�ζ���
131
+ rental_items = soup.find_all('div', class_='rent-item')
132
+
133
+ for item in rental_items:
134
+ try:
135
+ rental_info = self.extract_rental_info(item)
136
+ if rental_info:
137
+ page_data.append(rental_info)
138
+ except Exception as e:
139
+ print(f"�ѪR�浧��Ʈɵo�Ϳ��~: {e}")
140
+ continue
141
+
142
+ except Exception as e:
143
+ print(f"�ѪR������Ʈɵo�Ϳ��~: {e}")
144
+
145
+ return page_data
146
+
147
+ def extract_rental_info(self, item) -> Optional[Dict]:
148
+ """�����浧���θ�T"""
149
+ try:
150
+ # �򥻸�T
151
+ title_elem = item.find('h3', class_='rent-item-title')
152
+ title = title_elem.get_text(strip=True) if title_elem else "N/A"
153
+
154
+ # ����
155
+ price_elem = item.find('div', class_='rent-item-price')
156
+ price_text = price_elem.get_text(strip=True) if price_elem else "0"
157
+ price = self.extract_price(price_text)
158
+
159
+ # �a�}
160
+ address_elem = item.find('div', class_='rent-item-address')
161
+ address = address_elem.get_text(strip=True) if address_elem else "N/A"
162
+
163
+ # �ԲӸ�T
164
+ info_elem = item.find('div', class_='rent-item-info')
165
+ info_text = info_elem.get_text(strip=True) if info_elem else ""
166
+
167
+ # �����W�ơB�Ӽh����T
168
+ area = self.extract_area(info_text)
169
+ floor = self.extract_floor(info_text)
170
+
171
+ # �s��
172
+ link_elem = item.find('a')
173
+ link = self.base_url + link_elem.get('href') if link_elem and link_elem.get('href') else ""
174
+
175
+ return {
176
+ 'title': title,
177
+ 'price': price,
178
+ 'address': address,
179
+ 'area': area,
180
+ 'floor': floor,
181
+ 'link': link,
182
+ 'raw_info': info_text
183
+ }
184
+
185
+ except Exception as e:
186
+ print(f"�������θ�T�ɵo�Ϳ��~: {e}")
187
+ return None
188
+
189
+ def extract_price(self, price_text: str) -> int:
190
+ """���������Ʀr"""
191
+ try:
192
+ # �����D�Ʀr�r�šA��������
193
+ price_match = re.search(r'[\d,]+', price_text.replace(',', ''))
194
+ if price_match:
195
+ return int(price_match.group().replace(',', ''))
196
+ except:
197
+ pass
198
+ return 0
199
+
200
+ def extract_area(self, info_text: str) -> float:
201
+ """�����W��"""
202
+ try:
203
+ area_match = re.search(r'(\d+(?:\.\d+)?)\s*�W', info_text)
204
+ if area_match:
205
+ return float(area_match.group(1))
206
+ except:
207
+ pass
208
+ return 0.0
209
+
210
+ def extract_floor(self, info_text: str) -> str:
211
+ """�����Ӽh��T"""
212
+ try:
213
+ floor_match = re.search(r'(\d+)��', info_text)
214
+ if floor_match:
215
+ return floor_match.group(1) + '��'
216
+ except:
217
+ pass
218
+ return "N/A"
219
+
220
+ def save_data(self, data: List[Dict], filename: str = "rental_data.json"):
221
+ """�x�s��ƨ��ɮ�"""
222
+ try:
223
+ with open(f"output/{filename}", 'w', encoding='utf-8') as f:
224
+ json.dump(data, f, ensure_ascii=False, indent=2)
225
+ print(f"��Ƥw�x�s�� output/{filename}")
226
+ except Exception as e:
227
+ print(f"�x�s��Ʈɵo�Ϳ��~: {e}")
228
+
229
+ def to_dataframe(self, data: List[Dict]) -> pd.DataFrame:
230
+ """�ഫ��DataFrame"""
231
+ return pd.DataFrame(data)
232
+
233
if __name__ == "__main__":
    # Manual run: scrape up to five pages, then store the records as JSON
    # and as CSV.
    scraper = Rent591Scraper()
    print("�}�l����591����...")

    rental_data = scraper.scrape_rental_data(max_pages=5)

    if not rental_data:
        print("�S�������������")
    else:
        print(f"�`�@������ {len(rental_data)} �����")

        scraper.save_data(rental_data)

        df = scraper.to_dataframe(rental_data)
        df.to_csv("output/rental_data.csv", index=False, encoding='utf-8-sig')

        print("��ƪ��������I")
        print(df.head())
utils.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 由 Copilot 生成 (generated by Copilot)
2
+ import time
3
+ import json
4
+ from datetime import datetime
5
+
6
def log_message(message: str, level: str = "INFO"):
    """Print ``message`` prefixed with the current local time and ``level``."""
    now = datetime.now()
    timestamp = f"{now:%Y-%m-%d %H:%M:%S}"
    line = f"[{timestamp}] {level}: {message}"
    print(line)
10
+
11
def save_json(data, filename: str, output_dir: str = "output"):
    """Write ``data`` as UTF-8 JSON to ``<output_dir>/<filename>``.

    Returns:
        ``True`` on success; ``False`` when anything fails (the error is
        logged, not raised).
    """
    try:
        filepath = f"{output_dir}/{filename}"
        with open(filepath, 'w', encoding='utf-8') as handle:
            json.dump(data, handle, ensure_ascii=False, indent=2)
        log_message(f"��Ƥw�x�s�� {filepath}")
    except Exception as e:
        log_message(f"�x�s��Ʈɵo�Ϳ��~: {e}", "ERROR")
        return False
    return True
22
+
23
def load_json(filename: str, output_dir: str = "output"):
    """Read and return the JSON content of ``<output_dir>/<filename>``.

    Returns:
        The decoded data, or ``None`` when anything fails (the error is
        logged, not raised).
    """
    try:
        filepath = f"{output_dir}/{filename}"
        with open(filepath, 'r', encoding='utf-8') as handle:
            data = json.load(handle)
        log_message(f"���\���J {filepath}")
    except Exception as e:
        log_message(f"���J��Ʈɵo�Ϳ��~: {e}", "ERROR")
        return None
    return data
34
+
35
def format_currency(amount: float) -> str:
    """Format ``amount`` without decimals.

    Amounts of 10000 and above get thousands separators; smaller amounts are
    printed as plain digits.
    """
    pattern = "{:,.0f}" if amount >= 10000 else "{:.0f}"
    return pattern.format(amount)
41
+
42
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """Divide ``numerator`` by ``denominator`` without raising.

    Args:
        numerator: Value to divide.
        denominator: Value to divide by.
        default: Result used when the division is not possible.

    Returns:
        The quotient, or ``default`` when the denominator is zero or the
        operands cannot be divided.
    """
    try:
        if denominator == 0:
            return default
        return numerator / denominator
    except (TypeError, ValueError, ArithmeticError):
        # Only arithmetic and type problems are treated as "cannot divide";
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return default
50
+
51
def clean_text(text: str) -> str:
    """Normalise whitespace and strip unusual symbols from ``text``.

    Runs of whitespace collapse to a single space.  A character is kept when
    it is a CJK ideograph (U+4E00-U+9FFF), a word character, whitespace or
    one of ``.,!?;:()（）【】「」-``; everything else is removed.

    Args:
        text: Text to clean; empty or ``None`` input yields ``""``.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if not text:
        return ""

    import re

    # Collapse every whitespace run to one space.
    text = " ".join(text.split())

    # The full-width brackets must sit inside the character class.  In the
    # previous, mis-decoded pattern a stray "]" closed the class early, so the
    # substitution almost never removed anything.
    text = re.sub(r'[^\u4e00-\u9fff\w\s.,!?;:()（）【】「」\-]', '', text)

    return text.strip()
64
+
65
def retry_request(func, max_retries: int = 3, delay: float = 1.0):
    """Call ``func`` until it succeeds, doubling the wait after each failure.

    Args:
        func: Zero-argument callable to run.
        max_retries: Total number of attempts.
        delay: Seconds to wait after the first failure.

    Returns:
        Whatever ``func`` returns.  ``None`` when ``max_retries`` is below 1,
        because ``func`` is then never called.

    Raises:
        Exception: The error of the final attempt is re-raised.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            return func()
        except Exception as e:
            if attempt == final_attempt:
                raise e
            log_message(f"�ШD���ѡA{delay}���᭫��... (���� {attempt + 1}/{max_retries})", "WARNING")
            time.sleep(delay)
            delay *= 2  # exponential back-off
        attempt += 1
76
+
77
def validate_price(price_str: str) -> bool:
    """Tell whether ``price_str`` holds a plausible rent.

    The first run of digits (commas ignored) is read as the amount.

    Args:
        price_str: Text such as ``"25,000"``.

    Returns:
        ``True`` when the amount lies between 5000 and 100000 inclusive;
        ``False`` otherwise, including for non-string input.
    """
    import re

    if not isinstance(price_str, str):
        return False

    # Commas are dropped first, so a plain digit run captures the amount.
    price_match = re.search(r'\d+', price_str.replace(',', ''))
    if price_match is None:
        return False

    return 5000 <= int(price_match.group()) <= 100000
90
+
91
def validate_area(area_str: str) -> bool:
    """Tell whether ``area_str`` holds a plausible floor area.

    The first number in the text (optionally with decimals) is read as the
    area.

    Args:
        area_str: Text such as ``"25.5"`` followed by a unit.

    Returns:
        ``True`` when the area lies between 10 and 100 inclusive; ``False``
        otherwise, including for non-string input.
    """
    import re

    if not isinstance(area_str, str):
        return False

    area_match = re.search(r'(\d+(?:\.\d+)?)', area_str)
    if area_match is None:
        return False

    return 10 <= float(area_match.group(1)) <= 100
103
+
104
def create_output_directories():
    """Create the ``output`` directory tree, logging each directory created.

    Paths that already exist are left untouched.
    """
    import os

    for directory in ('output', 'output/images', 'output/data', 'output/reports'):
        if os.path.exists(directory):
            continue
        os.makedirs(directory)
        log_message(f"�Ыإؿ�: {directory}")
113
+
114
def get_current_timestamp() -> str:
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    now = datetime.now()
    return f"{now:%Y%m%d_%H%M%S}"
117
+
118
def calculate_statistics(data_list):
    """Compute descriptive statistics for a sequence of numbers.

    Args:
        data_list: A list, tuple, NumPy array or pandas Series of numbers.

    Returns:
        A dict with ``count``, ``mean``, ``median``, ``std`` (NumPy's default,
        i.e. population standard deviation), ``min``, ``max``, ``q25`` and
        ``q75``; an empty dict for ``None`` or empty input.
    """
    # len() rather than truthiness: "not array" raises for NumPy arrays and
    # pandas Series holding more than one element.
    if data_list is None or len(data_list) == 0:
        return {}

    import numpy as np

    data_array = np.array(data_list)

    return {
        'count': len(data_array),
        'mean': float(np.mean(data_array)),
        'median': float(np.median(data_array)),
        'std': float(np.std(data_array)),
        'min': float(np.min(data_array)),
        'max': float(np.max(data_array)),
        'q25': float(np.percentile(data_array, 25)),
        'q75': float(np.percentile(data_array, 75))
    }
137
+
138
def progress_bar(current: int, total: int, length: int = 50):
    """Redraw a one-line text progress bar on standard output.

    Nothing is printed when ``total`` is 0.  A newline is emitted once
    ``current`` reaches ``total``.

    Args:
        current: Number of finished items.
        total: Number of items overall.
        length: Width of the bar in characters.
    """
    if total == 0:
        return

    percent = (current / total) * 100
    filled = int(length * current // total)
    empty = length - filled
    bar = '�i' * filled + '-' * empty

    # "\r" returns to the line start so the bar overwrites itself.
    print(f'\r�i��: |{bar}| {percent:.1f}% ({current}/{total})', end='', flush=True)

    if current >= total:
        print()
visualizer.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # �� Copilot �ͦ�
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+ import pandas as pd
5
+ import numpy as np
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from plotly.subplots import make_subplots
9
+ import json
10
+ from typing import Dict, List
11
+
12
# Font setup for matplotlib: list CJK-capable sans-serif families first, and
# turn off the Unicode minus sign so negative tick labels use an ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft JhengHei', 'SimHei', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})
15
+
16
class RentalDataVisualizer:
    """Render rental-listing data and analysis results as charts.

    The instance holds a listings ``DataFrame`` (``self.df``) and a dict of
    precomputed analysis results (``self.analysis_results``). Static charts
    are drawn with matplotlib and the dashboard with plotly.

    NOTE(review): the user-facing message and label literals in this class
    contain U+FFFD replacement characters, i.e. they look like a lossy decode
    of non-UTF-8 source text. They are kept as found; restore them from the
    original file.
    """

    def __init__(self, df: pd.DataFrame = None, analysis_results: Dict = None):
        """Store the data to visualise.

        Args:
            df: Listings table; may be supplied later via ``load_data``.
            analysis_results: Analysis output; may be supplied later via
                ``load_analysis_results``.
        """
        self.df = df
        self.analysis_results = analysis_results
        self.colors = px.colors.qualitative.Set3

    @staticmethod
    def _ensure_parent_dir(save_path: str) -> None:
        """Create the directory that will contain ``save_path`` if missing."""
        import os

        parent = os.path.dirname(save_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _save_figure(self, fig, save_path: str) -> None:
        """Tighten the layout, write ``fig`` to ``save_path`` and close it."""
        self._ensure_parent_dir(save_path)
        fig.tight_layout()
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

    def load_data(self, data_path: str):
        """Load the listings table from a CSV file into ``self.df``.

        Errors (including a non-``.csv`` path) are reported on stdout and
        leave ``self.df`` unchanged.

        Args:
            data_path: Path to a ``.csv`` file, read as UTF-8 with optional BOM.
        """
        try:
            if data_path.endswith('.csv'):
                self.df = pd.read_csv(data_path, encoding='utf-8-sig')
            else:
                raise ValueError("�д���CSV�榡������ɮ�")
            # The backslash in this garbled literal is escaped so it stays a
            # literal backslash rather than an invalid escape sequence.
            print(f"���\\���J {len(self.df)} ����ƥΩ��ı��")
        except Exception as e:
            print(f"���J��Ʈɵo�Ϳ��~: {e}")

    def load_analysis_results(self, results_path: str):
        """Load analysis results from a JSON file into ``self.analysis_results``.

        Errors are reported on stdout and leave the attribute unchanged.

        Args:
            results_path: Path to a UTF-8 encoded JSON file.
        """
        try:
            with open(results_path, 'r', encoding='utf-8') as f:
                self.analysis_results = json.load(f)
            # The garbled literal ended in a bare backslash, which escaped the
            # closing quote; it is written as an escaped backslash here.
            print("���R���G���J���\\")
        except Exception as e:
            print(f"���J���R���G�ɵo�Ϳ��~: {e}")

    def plot_price_distribution(self, save_path: str = "output/price_distribution.png"):
        """Draw a histogram and a box plot of the ``price`` column.

        Args:
            save_path: Image file to write.
        """
        if self.df is None or 'price' not in self.df.columns:
            print("�L�kø�s���������ϡG�ʤָ��")
            return

        # Drop missing prices first, as the other plots do for their columns;
        # NaN values make the histogram range undefined.
        price_data = self.df['price'].dropna()
        if len(price_data) == 0:
            print("�L�kø�s���������ϡG�ʤָ��")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Histogram
        ax1.hist(price_data, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        ax1.set_xlabel('���� (��)')
        ax1.set_ylabel('����ƶq')
        ax1.set_title('�������������')
        ax1.grid(True, alpha=0.3)

        # Box plot
        ax2.boxplot(price_data, vert=True, patch_artist=True,
                    boxprops=dict(facecolor='lightgreen', alpha=0.7))
        ax2.set_ylabel('���� (��)')
        ax2.set_title('���������c�ι�')
        ax2.grid(True, alpha=0.3)

        self._save_figure(fig, save_path)
        print(f"���������Ϥw�x�s: {save_path}")

    def plot_price_ranges(self, save_path: str = "output/price_ranges.png"):
        """Draw a bar chart and a pie chart of listings per price range.

        Reads ``ranges``, ``counts`` and ``percentages`` from
        ``analysis_results['price_distribution']``.

        Args:
            save_path: Image file to write.
        """
        if not self.analysis_results or 'price_distribution' not in self.analysis_results:
            print("�L�kø�s�����϶��ϡG�ʤ֤��R���G")
            return

        dist_data = self.analysis_results['price_distribution']
        palette = self.colors[:len(dist_data['ranges'])]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Bar chart
        bars = ax1.bar(dist_data['ranges'], dist_data['counts'],
                       color=palette, alpha=0.8)
        ax1.set_xlabel('�����϶�')
        ax1.set_ylabel('����ƶq')
        ax1.set_title('�U�����϶�����ƶq')
        ax1.tick_params(axis='x', rotation=45)

        # Print each count just above its bar.
        for bar, count in zip(bars, dist_data['counts']):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                     f'{count}', ha='center', va='bottom')

        # Pie chart
        ax2.pie(dist_data['percentages'], labels=dist_data['ranges'], autopct='%1.1f%%',
                colors=palette, startangle=90)
        ax2.set_title('�����϶���Ҥ���')

        self._save_figure(fig, save_path)
        print(f"�����϶��Ϥw�x�s: {save_path}")

    def plot_area_analysis(self, save_path: str = "output/area_analysis.png"):
        """Draw an area-vs-price scatter plot and an area histogram.

        Args:
            save_path: Image file to write.
        """
        if self.df is None or 'area' not in self.df.columns:
            print("�L�kø�s�W�Ƥ��R�ϡG�ʤָ��")
            return

        # Ignore rows without an area value.
        area_data = self.df['area'].dropna()

        if len(area_data) == 0:
            print("�L�kø�s�W�Ƥ��R�ϡG�S�����Ī��W�Ƹ��")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Scatter plot: area against price.
        if 'price' in self.df.columns:
            valid_data = self.df.dropna(subset=['area', 'price'])
            if len(valid_data) > 0:
                ax1.scatter(valid_data['area'], valid_data['price'],
                            alpha=0.6, color='coral', s=50)
                ax1.set_xlabel('�W��')
                ax1.set_ylabel('���� (��)')
                ax1.set_title('�W�ƻP�������Y')
                ax1.grid(True, alpha=0.3)

                # Linear trend line. A degree-1 fit needs at least two
                # distinct x values; otherwise it is rank deficient.
                if valid_data['area'].nunique() > 1:
                    z = np.polyfit(valid_data['area'], valid_data['price'], 1)
                    p = np.poly1d(z)
                    ax1.plot(valid_data['area'], p(valid_data['area']), "r--", alpha=0.8)

        # Histogram of area values.
        ax2.hist(area_data, bins=15, alpha=0.7, color='lightgreen', edgecolor='black')
        ax2.set_xlabel('�W��')
        ax2.set_ylabel('����ƶq')
        ax2.set_title('�W�Ƥ���')
        ax2.grid(True, alpha=0.3)

        self._save_figure(fig, save_path)
        print(f"�W�Ƥ��R�Ϥw�x�s: {save_path}")

    def plot_price_per_ping(self, save_path: str = "output/price_per_ping.png"):
        """Draw a histogram and a box plot of the ``price_per_ping`` column.

        Args:
            save_path: Image file to write.
        """
        if self.df is None or 'price_per_ping' not in self.df.columns:
            print("�L�kø�s�C�W�����ϡG�ʤָ��")
            return

        price_per_ping_data = self.df['price_per_ping'].dropna()

        if len(price_per_ping_data) == 0:
            print("�L�kø�s�C�W�����ϡG�S�����Ī��C�W�������")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Histogram
        ax1.hist(price_per_ping_data, bins=20, alpha=0.7, color='gold', edgecolor='black')
        ax1.set_xlabel('�C�W���� (��/�W)')
        ax1.set_ylabel('����ƶq')
        ax1.set_title('�C�W��������')
        ax1.grid(True, alpha=0.3)

        # Box plot
        ax2.boxplot(price_per_ping_data, vert=True, patch_artist=True,
                    boxprops=dict(facecolor='orange', alpha=0.7))
        ax2.set_ylabel('�C�W���� (��/�W)')
        ax2.set_title('�C�W�����c�ι�')
        ax2.grid(True, alpha=0.3)

        self._save_figure(fig, save_path)
        print(f"�C�W�����Ϥw�x�s: {save_path}")

    def plot_keywords_analysis(self, save_path: str = "output/keywords_analysis.png"):
        """Draw a horizontal bar chart of keyword frequencies.

        Reads ``analysis_results['description_analysis']['keywords_frequency']``
        and plots only keywords whose count is greater than zero.

        Args:
            save_path: Image file to write.
        """
        if not self.analysis_results or 'description_analysis' not in self.analysis_results:
            print("�L�kø�s����r���R�ϡG�ʤ֤��R���G")
            return

        desc_analysis = self.analysis_results['description_analysis']
        if 'keywords_frequency' not in desc_analysis:
            print("�L�kø�s����r���R�ϡG�ʤ�����r���")
            return

        keywords_data = desc_analysis['keywords_frequency']

        # Keep only keywords that actually occur.
        filtered_keywords = {k: v for k, v in keywords_data.items() if v > 0}

        if not filtered_keywords:
            print("�S������������r���")
            return

        keywords = list(filtered_keywords.keys())
        frequencies = list(filtered_keywords.values())

        fig = plt.figure(figsize=(12, 8))
        bars = plt.barh(keywords, frequencies, color=self.colors[:len(keywords)])
        plt.xlabel('�X�{����')
        plt.ylabel('����r')
        plt.title('����y�z����r�W�v���R')
        plt.grid(True, alpha=0.3, axis='x')

        # Print each frequency just right of its bar.
        for bar, freq in zip(bars, frequencies):
            width = bar.get_width()
            plt.text(width + 0.1, bar.get_y() + bar.get_height()/2.,
                     f'{freq}', ha='left', va='center')

        self._save_figure(fig, save_path)
        print(f"����r���R�Ϥw�x�s: {save_path}")

    def create_interactive_dashboard(self, save_path: str = "output/dashboard.html"):
        """Build a 2x2 plotly dashboard and write it as an HTML file.

        Each panel is added only when the column or analysis result it needs
        is available, so a partially populated table still yields a dashboard.

        Args:
            save_path: HTML file to write.
        """
        if self.df is None:
            print("�L�k�Ыػ����O�G�ʤָ��")
            return

        # Subplot grid
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('��������', '�W��vs����', '�����϶�����', '�C�W��������'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"type": "bar"}, {"secondary_y": False}]]
        )

        # Guard the price column: indexing it unconditionally raises KeyError
        # on tables that lack it.
        has_price = 'price' in self.df.columns

        # 1. Price histogram
        if has_price:
            fig.add_trace(
                go.Histogram(x=self.df['price'], name='��������', nbinsx=20,
                             marker_color='skyblue', opacity=0.7),
                row=1, col=1
            )

        # 2. Area vs price scatter
        if has_price and 'area' in self.df.columns:
            valid_data = self.df.dropna(subset=['area', 'price'])
            if len(valid_data) > 0:
                fig.add_trace(
                    go.Scatter(x=valid_data['area'], y=valid_data['price'],
                               mode='markers', name='�W��vs����',
                               marker=dict(color='coral', size=8, opacity=0.6)),
                    row=1, col=2
                )

        # 3. Listings per price range
        if self.analysis_results and 'price_distribution' in self.analysis_results:
            dist_data = self.analysis_results['price_distribution']
            fig.add_trace(
                go.Bar(x=dist_data['ranges'], y=dist_data['counts'],
                       name='�����϶�', marker_color='lightgreen'),
                row=2, col=1
            )

        # 4. Price-per-ping histogram
        if 'price_per_ping' in self.df.columns:
            price_per_ping_data = self.df['price_per_ping'].dropna()
            if len(price_per_ping_data) > 0:
                fig.add_trace(
                    go.Histogram(x=price_per_ping_data, name='�C�W����', nbinsx=15,
                                 marker_color='gold', opacity=0.7),
                    row=2, col=2
                )

        # Overall layout
        fig.update_layout(
            title_text="���������s�ϯ��Υ������R�����O",
            title_x=0.5,
            height=800,
            showlegend=False
        )

        # Axis titles
        fig.update_xaxes(title_text="���� (��)", row=1, col=1)
        fig.update_yaxes(title_text="����ƶq", row=1, col=1)
        fig.update_xaxes(title_text="�W��", row=1, col=2)
        fig.update_yaxes(title_text="���� (��)", row=1, col=2)
        fig.update_xaxes(title_text="�����϶�", row=2, col=1)
        fig.update_yaxes(title_text="����ƶq", row=2, col=1)
        fig.update_xaxes(title_text="�C�W���� (��/�W)", row=2, col=2)
        fig.update_yaxes(title_text="����ƶq", row=2, col=2)

        # Write the interactive chart.
        self._ensure_parent_dir(save_path)
        fig.write_html(save_path)
        print(f"���ʦ������O�w�x�s: {save_path}")

    def generate_all_visualizations(self):
        """Produce every static chart and the interactive dashboard."""
        print("�}�l�ͦ���ı�ƹϪ�...")

        # Static charts
        self.plot_price_distribution()
        self.plot_price_ranges()
        self.plot_area_analysis()
        self.plot_price_per_ping()
        self.plot_keywords_analysis()

        # Interactive dashboard
        self.create_interactive_dashboard()

        print("�Ҧ���ı�ƹϪ��ͦ������I")

    def create_summary_report(self, save_path: str = "output/summary_report.png"):
        """Render the headline statistics as a text panel image.

        Reads ``analysis_results['basic_stats']`` (required) plus the optional
        ``area_stats``, ``price_per_ping_stats`` and ``insights`` entries.

        Args:
            save_path: Image file to write.
        """
        if not self.analysis_results or 'basic_stats' not in self.analysis_results:
            print("�L�k�ЫغK�n���i�G�ʤ֤��R���G")
            return

        fig, ax = plt.subplots(figsize=(12, 8))
        ax.axis('off')

        # Title
        fig.suptitle('���������s�ϯ��Υ������R�K�n���i', fontsize=20, fontweight='bold', y=0.95)

        # Basic statistics
        stats = self.analysis_results['basic_stats']

        # Report body
        report_text = f"""

        ? �������p
        ? �`�����: {stats['total_properties']} ��
        ? ��ƽd��: 2�СB��h�B�q��j��

        ? �����έp
        ? ��������: {stats['price_stats']['mean']:,} ��
        ? ����Ư���: {stats['price_stats']['median']:,} ��
        ? �̧C����: {stats['price_stats']['min']:,} ��
        ? �̰�����: {stats['price_stats']['max']:,} ��
        ? �зǮt: {stats['price_stats']['std']:,} ��

        ? �����S�x
        ? �Ĥ@�|�����: {stats['price_stats']['q25']:,} ��
        ? �ĤT�|�����: {stats['price_stats']['q75']:,} ��
        """

        # Area statistics, when present
        if 'area_stats' in stats and stats['area_stats']:
            area_stats = stats['area_stats']
            report_text += f"""
        ? �W�Ʋέp
        ? �����W��: {area_stats['mean']} �W
        ? ����ƩW��: {area_stats['median']} �W
        ? �̤p�W��: {area_stats['min']} �W
        ? �̤j�W��: {area_stats['max']} �W
        """

        # Price-per-ping statistics, when present
        if 'price_per_ping_stats' in stats and stats['price_per_ping_stats']:
            pp_stats = stats['price_per_ping_stats']
            report_text += f"""
        ? �C�W�����έp
        ? �����C�W����: {pp_stats['mean']:,} ��/�W
        ? ����ƨC�W����: {pp_stats['median']:,} ��/�W
        ? �̧C�C�W����: {pp_stats['min']:,} ��/�W
        ? �̰��C�W����: {pp_stats['max']:,} ��/�W
        """

        # Insights, when present
        if 'insights' in self.analysis_results:
            report_text += "\n\n? ���n�}��\n"
            for insight in self.analysis_results['insights']:
                report_text += f"? {insight}\n"

        # Draw the text panel
        ax.text(0.05, 0.95, report_text, transform=ax.transAxes, fontsize=12,
                verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8))

        self._save_figure(fig, save_path)
        print(f"�K�n���i�w�x�s: {save_path}")
389
+
390
if __name__ == "__main__":
    # Script entry point: run the visualizer against the default output files.
    viz = RentalDataVisualizer()

    # Inputs: the listings table and the analysis results.
    viz.load_data("output/rental_data.csv")
    viz.load_analysis_results("output/analysis_results.json")

    # Outputs: every chart, then the summary report image.
    viz.generate_all_visualizations()
    viz.create_summary_report()