"""Download several IELTS Writing Task 2 datasets from Hugging Face, parse the
band scores out of their free-text evaluations, and write one standardized
JSON file (``data/dataset_for_scorer.json``) for scorer training.

Each kept sample has the shape::

    {"prompt_text": str, "essay_text": str,
     "scores": {"task_response": float, "coherence_cohesion": float,
                "lexical_resource": float, "grammatical_range": float}}
"""

import json
import os
import re

from datasets import load_dataset
from tqdm import tqdm

# Regex to capture a band score (e.g. 7, 7.5 or 6.0).
FLOAT_RE = r"(\d+(?:\.\d+)?)"

# Canonical order of the four IELTS criteria in the output schema.
_SCORE_KEYS = (
    "task_response",
    "coherence_cohesion",
    "lexical_resource",
    "grammatical_range",
)


def to_float_safe(x):
    """Safely convert *x* to float; return None on failure or out-of-range.

    IELTS band scores are bounded to the closed interval [0, 9]; anything
    outside that range is treated as a parse error.
    """
    try:
        val = float(x)
    except Exception:
        return None
    return val if 0 <= val <= 9 else None


def _extract_scores(text, patterns):
    """Run each criterion regex over *text*.

    Returns a dict mapping each key in *patterns* to a float score, or None
    when the pattern did not match / the captured value was invalid.
    """
    scores = {}
    for key, pattern in patterns.items():
        match = pattern.search(text)
        scores[key] = to_float_safe(match.group(1)) if match else None
    return scores


def _scores_complete(scores):
    """True when all four criteria were parsed.

    Tests for ``None`` explicitly: 0.0 is a valid (if rare) band score, and a
    plain truthiness check would wrongly discard it.
    """
    return all(scores.get(key) is not None for key in _SCORE_KEYS)


def _make_sample(prompt, essay, scores):
    """Build one standardized output record."""
    return {
        "prompt_text": prompt,
        "essay_text": essay,
        "scores": {key: scores[key] for key in _SCORE_KEYS},
    }


def parse_chillies_dataset(dataset):
    """Parser for 'chillies/IELTS-writing-task-2-evaluation'.

    Score format in the 'evaluation' field:
    ``**Task Achievement: [7]**`` or ``**Overall Band Score: [7.5]**``.
    """
    print("Đang xử lý dataset 'chillies'...")
    cleaned = []
    bad_examples = 0
    patterns = {
        "task_response": re.compile(
            r"\*\*Task Achievement:\s*\[?" + FLOAT_RE + r"\]?\*\*", re.I
        ),
        "coherence_cohesion": re.compile(
            r"\*\*Coherence and Cohesion:\s*\[?" + FLOAT_RE + r"\]?\*\*", re.I
        ),
        "lexical_resource": re.compile(
            r"\*\*Lexical Resource:\s*\[?" + FLOAT_RE + r"\]?\*\*", re.I
        ),
        "grammatical_range": re.compile(
            r"\*\*Grammatical Range and Accuracy:\s*\[?" + FLOAT_RE + r"\]?\*\*", re.I
        ),
    }
    for item in tqdm(dataset, desc="Parsing chillies"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            evaluation_text = item.get('evaluation', '')
            # Require both texts plus a minimally long essay (>50 chars).
            if not (prompt and essay and evaluation_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = _extract_scores(evaluation_text, patterns)
            if _scores_complete(scores):
                cleaned.append(_make_sample(prompt, essay, scores))
            else:
                bad_examples += 1
        except Exception:
            # Malformed row (e.g. None field) — count and move on.
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_123harr_dataset(dataset):
    """Parser for '123Harr/IELTS-WT2-LLaMa3-1k'.

    The prompt, essay and scores are all embedded in the single 'formatted'
    field (a LLaMa-3 chat transcript); the first user turn is the prompt and
    the second is the essay.
    """
    print("Đang xử lý dataset '123Harr'...")
    cleaned = []
    bad_examples = 0
    prompt_essay_re = re.compile(
        r"<\|start_header_id\|>user<\|end_header_id\|>\n\n(.*?)<\|eot_id\|>", re.S
    )
    score_patterns = {
        "task_response": re.compile(
            r"(?:###|##|\*\*)?Task Achievement(?:\*\*)?:"
            r"[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?"
            + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M,
        ),
        "coherence_cohesion": re.compile(
            r"(?:###|##|\*\*)?Coherence and Cohesion(?:\*\*)?:"
            r"[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?"
            + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M,
        ),
        "lexical_resource": re.compile(
            r"(?:###|##|\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:"
            r"[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?"
            + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M,
        ),
        "grammatical_range": re.compile(
            r"(?:###|##|\*\*)?Grammatical Range and Accuracy(?:\*\*)?:"
            r"[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?"
            + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M,
        ),
    }
    for item in tqdm(dataset, desc="Parsing 123Harr"):
        try:
            formatted_text = item.get('formatted', '')
            if not formatted_text:
                bad_examples += 1
                continue
            matches = prompt_essay_re.findall(formatted_text)
            # Need at least two user turns: prompt then essay.
            if len(matches) < 2:
                bad_examples += 1
                continue
            prompt = matches[0].strip()
            essay = matches[1].strip()
            if not prompt or not essay or len(essay) < 50:
                bad_examples += 1
                continue
            # Each pattern has exactly one capture group (FLOAT_RE), so
            # group(1) is always the score.
            scores = _extract_scores(formatted_text, score_patterns)
            if _scores_complete(scores):
                cleaned.append(_make_sample(prompt, essay, scores))
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_dpo_dataset(dataset):
    """Parser for 'chillies/DPO_ielts_writing'.

    Scores are read from the 'chosen' response. The primary patterns expect
    the '## <criterion>: ... Suggested Band Score: X' layout; if any criterion
    is missing, a looser '<criterion>: X' fallback is tried instead.
    """
    print("Đang xử lý dataset 'DPO'...")
    cleaned = []
    bad_examples = 0
    patterns_primary = {
        "task_response": re.compile(
            r"##\s*Task Achievement:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I,
        ),
        "coherence_cohesion": re.compile(
            r"##\s*Coherence and Cohesion:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I,
        ),
        "lexical_resource": re.compile(
            r"##\s*Lexical Resource(?:\s*\(Vocabulary\))?:[\s\S]*?Suggested Band Score:\s*"
            + FLOAT_RE,
            re.I,
        ),
        "grammatical_range": re.compile(
            r"##\s*Grammatical Range and Accuracy:[\s\S]*?Suggested Band Score:\s*"
            + FLOAT_RE,
            re.I,
        ),
    }
    patterns_fallback = {
        "task_response": re.compile(
            r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*" + FLOAT_RE, re.I
        ),
        "coherence_cohesion": re.compile(
            r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*" + FLOAT_RE, re.I
        ),
        "lexical_resource": re.compile(
            r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*" + FLOAT_RE,
            re.I,
        ),
        "grammatical_range": re.compile(
            r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*" + FLOAT_RE, re.I
        ),
    }
    for item in tqdm(dataset, desc="Parsing DPO"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            chosen_text = item.get('chosen', '')
            if not (prompt and essay and chosen_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = _extract_scores(chosen_text, patterns_primary)
            if not _scores_complete(scores):
                # Primary layout not found — retry with the looser patterns.
                scores = _extract_scores(chosen_text, patterns_fallback)
            if _scores_complete(scores):
                cleaned.append(_make_sample(prompt, essay, scores))
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_hadeel_dataset(dataset):
    """Parser for 'hadeelbkh/tokenized-IELTS-writing-task-2-evaluation'.

    Same prompt/essay/evaluation layout as the chillies dataset, but the
    criterion headings are lowercase and may be prefixed with a dash.
    """
    print("Đang xử lý dataset 'hadeel'...")
    cleaned = []
    bad_examples = 0
    patterns = {
        "task_response": re.compile(
            r"(?:\*\*)?task achievement(?:\*\*)?:\s*-?\s*" + FLOAT_RE, re.I
        ),
        "coherence_cohesion": re.compile(
            r"(?:\*\*)?coherence and cohesion(?:\*\*)?:\s*-?\s*" + FLOAT_RE, re.I
        ),
        "lexical_resource": re.compile(
            r"(?:\*\*)?lexical resource(?:\s*\(vocabulary\))?(?:\*\*)?:\s*-?\s*"
            + FLOAT_RE,
            re.I,
        ),
        "grammatical_range": re.compile(
            r"(?:\*\*)?grammatical range and accuracy(?:\*\*)?:\s*-?\s*" + FLOAT_RE,
            re.I,
        ),
    }
    for item in tqdm(dataset, desc="Parsing hadeel"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            evaluation_text = item.get('evaluation', '')
            if not (prompt and essay and evaluation_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = _extract_scores(evaluation_text, patterns)
            if _scores_complete(scores):
                cleaned.append(_make_sample(prompt, essay, scores))
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def parse_vietanh_dataset(dataset):
    """Parser for 'vietanh0802/ielts_writing_training_data_prepared'.

    Format of 'training_text':
    ``[INST] ... ### Prompt: ... ### Essay: ... [/INST] ...``
    """
    print("Đang xử lý dataset 'vietanh'...")
    cleaned = []
    bad_examples = 0
    prompt_re = re.compile(r"### Prompt:\s*(.*?)(?=### Essay:|$)", re.S | re.I)
    essay_re = re.compile(r"### Essay:\s*(.*?)(?=\[/INST\]|$)", re.S | re.I)
    score_patterns = {
        "task_response": re.compile(
            r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*\[?" + FLOAT_RE + r"\]?", re.I
        ),
        "coherence_cohesion": re.compile(
            r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*\[?" + FLOAT_RE + r"\]?", re.I
        ),
        "lexical_resource": re.compile(
            r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*\[?"
            + FLOAT_RE + r"\]?",
            re.I,
        ),
        "grammatical_range": re.compile(
            r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*\[?"
            + FLOAT_RE + r"\]?",
            re.I,
        ),
    }
    for item in tqdm(dataset, desc="Parsing vietanh"):
        try:
            training_text = item.get('training_text', '')
            if not training_text:
                bad_examples += 1
                continue
            prompt_match = prompt_re.search(training_text)
            if not prompt_match:
                bad_examples += 1
                continue
            prompt = prompt_match.group(1).strip()
            essay_match = essay_re.search(training_text)
            if not essay_match:
                bad_examples += 1
                continue
            essay = essay_match.group(1).strip()
            if not prompt or not essay or len(essay) < 50:
                bad_examples += 1
                continue
            scores = _extract_scores(training_text, score_patterns)
            if _scores_complete(scores):
                cleaned.append(_make_sample(prompt, essay, scores))
            else:
                bad_examples += 1
        except Exception:
            bad_examples += 1
    print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned


def main():
    """Download, parse, merge and dump all datasets to a single JSON file."""
    print("Đang tải các dataset từ Hugging Face...\n")
    cache_dir = "./.cache/huggingface_datasets"
    all_data = []

    # (display name, Hugging Face repo id, parser). NOTE(review): the hadeel
    # repo id here ends in '-DialoGPT-medium' while the parser's docstring
    # names the repo without that suffix — confirm which one is intended.
    sources = [
        ("chillies", "chillies/IELTS-writing-task-2-evaluation",
         parse_chillies_dataset),
        ("123Harr", "123Harr/IELTS-WT2-LLaMa3-1k",
         parse_123harr_dataset),
        ("DPO", "chillies/DPO_ielts_writing",
         parse_dpo_dataset),
        ("hadeel", "hadeelbkh/tokenized-IELTS-writing-task-2-evaluation-DialoGPT-medium",
         parse_hadeel_dataset),
        ("vietanh", "vietanh0802/ielts_writing_training_data_prepared",
         parse_vietanh_dataset),
    ]
    for name, repo_id, parser in sources:
        try:
            ds = load_dataset(repo_id, split="train", cache_dir=cache_dir)
            all_data.append((name, parser(ds)))
        except Exception as e:
            # A failed download/parse of one dataset must not abort the rest.
            print(f"✗ Lỗi tải {name}: {e}\n")

    # Summary table.
    print("\n" + "=" * 60)
    print("--- TỔNG HỢP ---")
    print("=" * 60)
    total = 0
    for name, data in all_data:
        count = len(data)
        total += count
        print(f"Dataset ({name:15}): {count:5d} mẫu")
    print("=" * 60)
    print(f"Tổng cộng mẫu hợp lệ: {total}")
    print("=" * 60)

    final_dataset = []
    for name, data in all_data:
        final_dataset.extend(data)

    if not final_dataset:
        print("✗ Lỗi: Không có dữ liệu nào được chuẩn hóa. Vui lòng kiểm tra lại script.")
        return

    output_dir = "data"
    output_path = os.path.join(output_dir, "dataset_for_scorer.json")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"✓ Đã tạo thư mục {output_dir}")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_dataset, f, ensure_ascii=False, indent=2)
    print(f"✓ Đã ghi {len(final_dataset)} mẫu vào file '{output_path}'.")
    print("\n✓ Hoàn tất! Bây giờ bạn có thể chạy 'src/train.py' trên Colab!")


if __name__ == "__main__":
    main()