import json
import re
import os
from datasets import load_dataset
from tqdm import tqdm

# Regex to capture a band score (e.g. 7, 7.5, or 6.0)
FLOAT_RE = r"(\d+(?:\.\d+)?)"


def to_float_safe(x):
    """Safely convert a value to float; return None if conversion fails or the score is out of range."""
    try:
        val = float(x)
        # Only accept scores in the valid IELTS band range (0-9)
        if 0 <= val <= 9:
            return val
        return None
    except Exception:
        return None
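# Usage sketch for to_float_safe (behaviour follows directly from the checks above):
#   to_float_safe("7.5")  -> 7.5
#   to_float_safe("12")   -> None  (outside the 0-9 band range)
#   to_float_safe("n/a")  -> None  (not parseable as a float)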


def parse_chillies_dataset(dataset):
    """
    Parser for 'chillies/IELTS-writing-task-2-evaluation'.
    Format: **Task Achievement: [7]** or **Overall Band Score: [7.5]**
    """
    print("Processing dataset 'chillies'...")
    cleaned = []
    bad_examples = 0
    patterns = {
        "task_response": re.compile(
            r"\*\*Task Achievement:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"\*\*Coherence and Cohesion:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
        "lexical_resource": re.compile(
            r"\*\*Lexical Resource:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
        "grammatical_range": re.compile(
            r"\*\*Grammatical Range and Accuracy:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
    }
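    # Illustrative lines these patterns are written to match (brackets are optional):
    #   "**Task Achievement: [7]**"   or   "**Task Achievement: 7**"
    # FLOAT_RE already contains a capture group, so group(1) below returns just the number.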
    for item in tqdm(dataset, desc="Parsing chillies"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            evaluation_text = item.get('evaluation', '')
            if not (prompt and essay and evaluation_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in patterns.items():
                match = pattern.search(evaluation_text)
                if match:
                    score_str = match.group(1)
                    scores[key] = to_float_safe(score_str)
                else:
                    scores[key] = None
            # Keep the sample only if every criterion was parsed (None marks a parse failure)
            if all(v is not None for v in scores.values()):
                standard_scores = {
                    "task_response": scores["task_response"],
                    "coherence_cohesion": scores["coherence_cohesion"],
                    "lexical_resource": scores["lexical_resource"],
                    "grammatical_range": scores["grammatical_range"]
                }
                cleaned.append({
                    "prompt_text": prompt,
                    "essay_text": essay,
                    "scores": standard_scores
                })
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_123harr_dataset(dataset):
    """
    Parser for '123Harr/IELTS-WT2-LLaMa3-1k'.
    Extracts scores from the 'formatted' field.
    """
    print("Processing dataset '123Harr'...")
    cleaned = []
    bad_examples = 0
    prompt_essay_re = re.compile(
        r"<\|start_header_id\|>user<\|end_header_id\|>\n\n(.*?)<\|eot_id\|>",
        re.S
    )
    score_patterns = {
        "task_response": re.compile(
            r"(?:###|##|\*\*)?Task Achievement(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
        "coherence_cohesion": re.compile(
            r"(?:###|##|\*\*)?Coherence and Cohesion(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
        "lexical_resource": re.compile(
            r"(?:###|##|\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
        "grammatical_range": re.compile(
            r"(?:###|##|\*\*)?Grammatical Range and Accuracy(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
    }
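    # Illustrative sketch of the 'formatted' layout the regexes above assume (inferred from the
    # patterns, not verified against the dataset): two user turns in LLaMa-3 chat markup,
    #   <|start_header_id|>user<|end_header_id|>\n\n<prompt text><|eot_id|>
    #   <|start_header_id|>user<|end_header_id|>\n\n<essay text><|eot_id|>
    # followed by feedback containing lines such as
    #   "### Task Achievement: ... Suggested Band Score: 7.0"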
    for item in tqdm(dataset, desc="Parsing 123Harr"):
        try:
            formatted_text = item.get('formatted', '')
            if not formatted_text:
                bad_examples += 1
                continue
            matches = prompt_essay_re.findall(formatted_text)
            if len(matches) < 2:
                bad_examples += 1
                continue
            prompt = matches[0].strip()
            essay = matches[1].strip()
            if not prompt or not essay or len(essay) < 50:
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in score_patterns.items():
                match = pattern.search(formatted_text)
                if match:
                    # FLOAT_RE is the only capture group in these patterns
                    score_str = match.group(1)
                    scores[key] = to_float_safe(score_str)
                else:
                    scores[key] = None
            # Keep the sample only if every criterion was parsed (None marks a parse failure)
            if all(v is not None for v in scores.values()):
                standard_scores = {
                    "task_response": scores["task_response"],
                    "coherence_cohesion": scores["coherence_cohesion"],
                    "lexical_resource": scores["lexical_resource"],
                    "grammatical_range": scores["grammatical_range"]
                }
                cleaned.append({
                    "prompt_text": prompt,
                    "essay_text": essay,
                    "scores": standard_scores
                })
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_dpo_dataset(dataset):
    """
    Parser for 'chillies/DPO_ielts_writing'.
    """
    print("Processing dataset 'DPO'...")
    cleaned = []
    bad_examples = 0
    patterns_primary = {
        "task_response": re.compile(
            r"##\s*Task Achievement:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"##\s*Coherence and Cohesion:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
        "lexical_resource": re.compile(
            r"##\s*Lexical Resource(?:\s*\(Vocabulary\))?:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
        "grammatical_range": re.compile(
            r"##\s*Grammatical Range and Accuracy:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
    }
    patterns_fallback = {
        "task_response": re.compile(r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*" + FLOAT_RE, re.I),
        "coherence_cohesion": re.compile(r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*" + FLOAT_RE, re.I),
        "lexical_resource": re.compile(r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*" + FLOAT_RE, re.I),
        "grammatical_range": re.compile(r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*" + FLOAT_RE, re.I),
    }
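    # The primary patterns target sectioned feedback such as
    #   "## Task Achievement: ... Suggested Band Score: 7.0"
    # while the fallback patterns accept a terser style such as
    #   "Task Achievement: 7.0"  or  "**Task Achievement**: 7.0"
    # when the 'chosen' text does not follow the sectioned layout.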
    for item in tqdm(dataset, desc="Parsing DPO"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            chosen_text = item.get('chosen', '')
            if not (prompt and essay and chosen_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in patterns_primary.items():
                match = pattern.search(chosen_text)
                if match:
                    scores[key] = to_float_safe(match.group(1))
                else:
                    scores[key] = None
            # Retry with the fallback patterns if the primary format did not yield all four scores
            if not all(v is not None for v in scores.values()):
                scores = {}
                for key, pattern in patterns_fallback.items():
                    match = pattern.search(chosen_text)
                    if match:
                        scores[key] = to_float_safe(match.group(1))
                    else:
                        scores[key] = None
            if all(v is not None for v in scores.values()):
                standard_scores = {
                    "task_response": scores["task_response"],
                    "coherence_cohesion": scores["coherence_cohesion"],
                    "lexical_resource": scores["lexical_resource"],
                    "grammatical_range": scores["grammatical_range"]
                }
                cleaned.append({
                    "prompt_text": prompt,
                    "essay_text": essay,
                    "scores": standard_scores
                })
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_hadeel_dataset(dataset):
    """
    Parser for 'hadeelbkh/tokenized-IELTS-writing-task-2-evaluation'.
    """
    print("Processing dataset 'hadeel'...")
    cleaned = []
    bad_examples = 0
    patterns = {
        "task_response": re.compile(
            r"(?:\*\*)?task achievement(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"(?:\*\*)?coherence and cohesion(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
        "lexical_resource": re.compile(
            r"(?:\*\*)?lexical resource(?:\s*\(vocabulary\))?(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
        "grammatical_range": re.compile(
            r"(?:\*\*)?grammatical range and accuracy(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
    }
    for item in tqdm(dataset, desc="Parsing hadeel"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            evaluation_text = item.get('evaluation', '')
            if not (prompt and essay and evaluation_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in patterns.items():
                match = pattern.search(evaluation_text)
                if match:
                    score_str = match.group(1)
                    scores[key] = to_float_safe(score_str)
                else:
                    scores[key] = None
            if all(v is not None for v in scores.values()):
                standard_scores = {
                    "task_response": scores["task_response"],
                    "coherence_cohesion": scores["coherence_cohesion"],
                    "lexical_resource": scores["lexical_resource"],
                    "grammatical_range": scores["grammatical_range"]
                }
                cleaned.append({
                    "prompt_text": prompt,
                    "essay_text": essay,
                    "scores": standard_scores
                })
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_vietanh_dataset(dataset):
    """
    Parser for 'vietanh0802/ielts_writing_training_data_prepared'.
    Format: <s>[INST] ... ### Prompt: ... ### Essay: ... [/INST] ...
    """
    print("Processing dataset 'vietanh'...")
    cleaned = []
    bad_examples = 0
    prompt_re = re.compile(r"### Prompt:\s*(.*?)(?=### Essay:|$)", re.S | re.I)
    essay_re = re.compile(r"### Essay:\s*(.*?)(?=\[/INST\]|$)", re.S | re.I)
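    # Per the format in the docstring: prompt_re captures the text between "### Prompt:" and
    # "### Essay:", and essay_re captures the text between "### Essay:" and "[/INST]".
    # The score patterns below then search the whole training_text for lines such as
    # "Task Achievement: [7]" (brackets optional).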
    score_patterns = {
        "task_response": re.compile(
            r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
        "lexical_resource": re.compile(
            r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
        "grammatical_range": re.compile(
            r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
    }
    for item in tqdm(dataset, desc="Parsing vietanh"):
        try:
            training_text = item.get('training_text', '')
            if not training_text:
                bad_examples += 1
                continue
            prompt_match = prompt_re.search(training_text)
            if not prompt_match:
                bad_examples += 1
                continue
            prompt = prompt_match.group(1).strip()
            essay_match = essay_re.search(training_text)
            if not essay_match:
                bad_examples += 1
                continue
            essay = essay_match.group(1).strip()
            if not prompt or not essay or len(essay) < 50:
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in score_patterns.items():
                match = pattern.search(training_text)
                if match:
                    scores[key] = to_float_safe(match.group(1))
                else:
                    scores[key] = None
            if all(v is not None for v in scores.values()):
                standard_scores = {
                    "task_response": scores["task_response"],
                    "coherence_cohesion": scores["coherence_cohesion"],
                    "lexical_resource": scores["lexical_resource"],
                    "grammatical_range": scores["grammatical_range"]
                }
                cleaned.append({
                    "prompt_text": prompt,
                    "essay_text": essay,
                    "scores": standard_scores
                })
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def main():
    print("Loading datasets from Hugging Face...\n")
    cache_dir = "./.cache/huggingface_datasets"
    all_data = []

    # Dataset 1: chillies/IELTS-writing-task-2-evaluation
    try:
        ds_chillies = load_dataset(
            "chillies/IELTS-writing-task-2-evaluation",
            split="train",
            cache_dir=cache_dir
        )
        all_data.append(("chillies", parse_chillies_dataset(ds_chillies)))
    except Exception as e:
        print(f"✗ Failed to load chillies: {e}\n")

    # Dataset 2: 123Harr/IELTS-WT2-LLaMa3-1k
    try:
        ds_123harr = load_dataset(
            "123Harr/IELTS-WT2-LLaMa3-1k",
            split="train",
            cache_dir=cache_dir
        )
        all_data.append(("123Harr", parse_123harr_dataset(ds_123harr)))
    except Exception as e:
        print(f"✗ Failed to load 123Harr: {e}\n")

    # Dataset 3: chillies/DPO_ielts_writing
    try:
        ds_chillies_2 = load_dataset(
            "chillies/DPO_ielts_writing",
            split="train",
            cache_dir=cache_dir
        )
        all_data.append(("DPO", parse_dpo_dataset(ds_chillies_2)))
    except Exception as e:
        print(f"✗ Failed to load DPO: {e}\n")

    # Dataset 4: hadeelbkh/tokenized-IELTS-writing-task-2-evaluation
    try:
        ds_hadeel = load_dataset(
            "hadeelbkh/tokenized-IELTS-writing-task-2-evaluation-DialoGPT-medium",
            split="train",
            cache_dir=cache_dir
        )
        all_data.append(("hadeel", parse_hadeel_dataset(ds_hadeel)))
    except Exception as e:
        print(f"✗ Failed to load hadeel: {e}\n")

    # Dataset 5: vietanh0802/ielts_writing_training_data_prepared
    try:
        ds_vietanh = load_dataset(
            "vietanh0802/ielts_writing_training_data_prepared",
            split="train",
            cache_dir=cache_dir
        )
        all_data.append(("vietanh", parse_vietanh_dataset(ds_vietanh)))
    except Exception as e:
        print(f"✗ Failed to load vietanh: {e}\n")

    # Summary
    print("\n" + "=" * 60)
    print("--- SUMMARY ---")
    print("=" * 60)
    total = 0
    for name, data in all_data:
        count = len(data)
        total += count
        print(f"Dataset ({name:15}): {count:5d} samples")
    print("=" * 60)
    print(f"Total valid samples: {total}")
    print("=" * 60)

    final_dataset = []
    for name, data in all_data:
        final_dataset.extend(data)
    if not final_dataset:
        print("✗ Error: no data could be normalized. Please check the script.")
        return

    output_dir = "data"
    output_path = os.path.join(output_dir, "dataset_for_scorer.json")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"✓ Created directory {output_dir}")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_dataset, f, ensure_ascii=False, indent=2)
    print(f"✓ Wrote {len(final_dataset)} samples to '{output_path}'.")
    print("\n✓ Done! You can now run 'src/train.py' on Colab!")


if __name__ == "__main__":
    main()
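
# A minimal downstream sketch (assumption: the scorer training step in src/train.py reads the
# JSON written above; its actual loading code may differ):
#
#   with open("data/dataset_for_scorer.json", encoding="utf-8") as f:
#       samples = json.load(f)
#   # each sample looks like:
#   # {"prompt_text": ..., "essay_text": ...,
#   #  "scores": {"task_response": 7.0, "coherence_cohesion": 6.5,
#   #             "lexical_resource": 7.0, "grammatical_range": 6.5}}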