ielts-grader-api / src / clean_external_data.py
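"""
Clean and normalize external IELTS Writing Task 2 datasets from Hugging Face.

Each parser below extracts the prompt, the essay, and the four criterion band
scores (task_response, coherence_cohesion, lexical_resource, grammatical_range)
from one source dataset. main() merges the results into a single list and
writes it to data/dataset_for_scorer.json, the input expected by src/train.py.

Typical usage (from the repository root):
    python src/clean_external_data.py
"""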
import json
import re
import os
from datasets import load_dataset
from tqdm import tqdm
# Regex to capture band scores (e.g. 7, 7.5, or 6.0); note it already contains a capture group
FLOAT_RE = r"(\d+(?:\.\d+)?)"
def to_float_safe(x):
"""Chuyển đổi an toàn sang float, nếu lỗi trả về None"""
try:
val = float(x)
        # Only accept scores in the valid IELTS band range (0-9)
if 0 <= val <= 9:
return val
return None
except Exception:
return None
def parse_chillies_dataset(dataset):
"""
    Parser for 'chillies/IELTS-writing-task-2-evaluation'.
    Expected format: **Task Achievement: [7]** or **Overall Band Score: [7.5]**
    """
    print("Processing dataset 'chillies'...")
cleaned = []
bad_examples = 0
patterns = {
"task_response": re.compile(
r"\*\*Task Achievement:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
re.I
),
"coherence_cohesion": re.compile(
r"\*\*Coherence and Cohesion:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
re.I
),
"lexical_resource": re.compile(
r"\*\*Lexical Resource:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
re.I
),
"grammatical_range": re.compile(
r"\*\*Grammatical Range and Accuracy:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
re.I
),
}
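    # Example evaluation line these patterns match (the brackets are optional):
    #   **Task Achievement: [7]**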
for item in tqdm(dataset, desc="Parsing chillies"):
try:
prompt = item.get('prompt', '').strip()
essay = item.get('essay', '').strip()
evaluation_text = item.get('evaluation', '')
if not (prompt and essay and evaluation_text and len(essay) > 50):
bad_examples += 1
continue
scores = {}
for key, pattern in patterns.items():
match = pattern.search(evaluation_text)
if match:
score_str = match.group(1)
scores[key] = to_float_safe(score_str)
else:
scores[key] = None
            # Explicit None check so a (valid) 0.0 band score is not treated as missing
            if all(v is not None for v in scores.values()):
standard_scores = {
"task_response": scores["task_response"],
"coherence_cohesion": scores["coherence_cohesion"],
"lexical_resource": scores["lexical_resource"],
"grammatical_range": scores["grammatical_range"]
}
cleaned.append({
"prompt_text": prompt,
"essay_text": essay,
"scores": standard_scores
})
else:
bad_examples += 1
except Exception:
bad_examples += 1
print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
return cleaned
def parse_123harr_dataset(dataset):
"""
    Parser for '123Harr/IELTS-WT2-LLaMa3-1k'.
    Extracts the prompt, essay, and band scores from the 'formatted' field.
    """
    print("Processing dataset '123Harr'...")
cleaned = []
bad_examples = 0
prompt_essay_re = re.compile(
r"<\|start_header_id\|>user<\|end_header_id\|>\n\n(.*?)<\|eot_id\|>",
re.S
)
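    # 'formatted' uses the LLaMA-3 chat template; this regex captures every
    # user-turn block, and the loop below treats the first block as the prompt
    # and the second as the essay.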
score_patterns = {
"task_response": re.compile(
r"(?:###|##|\*\*)?Task Achievement(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
re.I | re.M
),
"coherence_cohesion": re.compile(
r"(?:###|##|\*\*)?Coherence and Cohesion(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
re.I | re.M
),
"lexical_resource": re.compile(
r"(?:###|##|\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
re.I | re.M
),
"grammatical_range": re.compile(
r"(?:###|##|\*\*)?Grammatical Range and Accuracy(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
re.I | re.M
),
}
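    # These patterns are intentionally loose: they locate the criterion name
    # regardless of surrounding ###/** markdown and take the first number that
    # follows it.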
for item in tqdm(dataset, desc="Parsing 123Harr"):
try:
formatted_text = item.get('formatted', '')
if not formatted_text:
bad_examples += 1
continue
matches = prompt_essay_re.findall(formatted_text)
if len(matches) < 2:
bad_examples += 1
continue
prompt = matches[0].strip()
essay = matches[1].strip()
if not prompt or not essay or len(essay) < 50:
bad_examples += 1
continue
scores = {}
for key, pattern in score_patterns.items():
match = pattern.search(formatted_text)
if match:
score_str = match.group(match.lastindex) if match.lastindex else match.group(1)
scores[key] = to_float_safe(score_str)
else:
scores[key] = None
            if all(v is not None for v in scores.values()):
standard_scores = {
"task_response": scores["task_response"],
"coherence_cohesion": scores["coherence_cohesion"],
"lexical_resource": scores["lexical_resource"],
"grammatical_range": scores["grammatical_range"]
}
cleaned.append({
"prompt_text": prompt,
"essay_text": essay,
"scores": standard_scores
})
else:
bad_examples += 1
except Exception:
bad_examples += 1
print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
return cleaned
def parse_dpo_dataset(dataset):
"""
    Parser for 'chillies/DPO_ielts_writing'.
    Band scores are taken from the 'chosen' response text.
    """
    print("Processing dataset 'DPO'...")
cleaned = []
bad_examples = 0
patterns_primary = {
"task_response": re.compile(
r"##\s*Task Achievement:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
re.I
),
"coherence_cohesion": re.compile(
r"##\s*Coherence and Cohesion:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
re.I
),
"lexical_resource": re.compile(
r"##\s*Lexical Resource(?:\s*\(Vocabulary\))?:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
re.I
),
"grammatical_range": re.compile(
r"##\s*Grammatical Range and Accuracy:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
re.I
),
}
patterns_fallback = {
"task_response": re.compile(r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*" + FLOAT_RE, re.I),
"coherence_cohesion": re.compile(r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*" + FLOAT_RE, re.I),
"lexical_resource": re.compile(r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*" + FLOAT_RE, re.I),
"grammatical_range": re.compile(r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*" + FLOAT_RE, re.I),
}
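    # patterns_primary expects full sections, e.g.
    #   ## Task Achievement:
    #   ... Suggested Band Score: 7.0
    # patterns_fallback covers the simpler inline form, e.g.
    #   Task Achievement: 7.0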
for item in tqdm(dataset, desc="Parsing DPO"):
try:
prompt = item.get('prompt', '').strip()
essay = item.get('essay', '').strip()
chosen_text = item.get('chosen', '')
if not (prompt and essay and chosen_text and len(essay) > 50):
bad_examples += 1
continue
scores = {}
for key, pattern in patterns_primary.items():
match = pattern.search(chosen_text)
if match:
scores[key] = to_float_safe(match.group(1))
else:
scores[key] = None
            # Fall back to the simpler patterns if any criterion is still missing
            if any(v is None for v in scores.values()):
scores = {}
for key, pattern in patterns_fallback.items():
match = pattern.search(chosen_text)
if match:
scores[key] = to_float_safe(match.group(1))
else:
scores[key] = None
            if all(v is not None for v in scores.values()):
standard_scores = {
"task_response": scores["task_response"],
"coherence_cohesion": scores["coherence_cohesion"],
"lexical_resource": scores["lexical_resource"],
"grammatical_range": scores["grammatical_range"]
}
cleaned.append({
"prompt_text": prompt,
"essay_text": essay,
"scores": standard_scores
})
else:
bad_examples += 1
except Exception:
bad_examples += 1
print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
return cleaned
def parse_hadeel_dataset(dataset):
"""
    Parser for 'hadeelbkh/tokenized-IELTS-writing-task-2-evaluation'.
    """
    print("Processing dataset 'hadeel'...")
cleaned = []
bad_examples = 0
patterns = {
"task_response": re.compile(
r"(?:\*\*)?task achievement(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
re.I
),
"coherence_cohesion": re.compile(
r"(?:\*\*)?coherence and cohesion(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
re.I
),
"lexical_resource": re.compile(
r"(?:\*\*)?lexical resource(?:\s*\(vocabulary\))?(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
re.I
),
"grammatical_range": re.compile(
r"(?:\*\*)?grammatical range and accuracy(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
re.I
),
}
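    # The patterns accept lines such as
    #   task achievement: 7.0
    #   **Coherence and Cohesion**: - 6.5
    # (case-insensitive, optional ** markers, optional dash before the score).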
for item in tqdm(dataset, desc="Parsing hadeel"):
try:
prompt = item.get('prompt', '').strip()
essay = item.get('essay', '').strip()
evaluation_text = item.get('evaluation', '')
if not (prompt and essay and evaluation_text and len(essay) > 50):
bad_examples += 1
continue
scores = {}
for key, pattern in patterns.items():
match = pattern.search(evaluation_text)
if match:
score_str = match.group(1)
scores[key] = to_float_safe(score_str)
else:
scores[key] = None
            if all(v is not None for v in scores.values()):
standard_scores = {
"task_response": scores["task_response"],
"coherence_cohesion": scores["coherence_cohesion"],
"lexical_resource": scores["lexical_resource"],
"grammatical_range": scores["grammatical_range"]
}
cleaned.append({
"prompt_text": prompt,
"essay_text": essay,
"scores": standard_scores
})
else:
bad_examples += 1
except Exception:
bad_examples += 1
print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
return cleaned
def parse_vietanh_dataset(dataset):
"""
    Parser for 'vietanh0802/ielts_writing_training_data_prepared'.
    Format: <s>[INST] ... ### Prompt: ... ### Essay: ... [/INST] ...
    """
    print("Processing dataset 'vietanh'...")
cleaned = []
bad_examples = 0
prompt_re = re.compile(r"### Prompt:\s*(.*?)(?=### Essay:|$)", re.S | re.I)
essay_re = re.compile(r"### Essay:\s*(.*?)(?=\[/INST\]|$)", re.S | re.I)
score_patterns = {
"task_response": re.compile(
r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
re.I
),
"coherence_cohesion": re.compile(
r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
re.I
),
"lexical_resource": re.compile(
r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
re.I
),
"grammatical_range": re.compile(
r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
re.I
),
}
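    # Band scores are matched anywhere in training_text, e.g.
    #   **Task Achievement**: [7.5]   (the ** markers and brackets are optional)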
for item in tqdm(dataset, desc="Parsing vietanh"):
try:
training_text = item.get('training_text', '')
if not training_text:
bad_examples += 1
continue
prompt_match = prompt_re.search(training_text)
if not prompt_match:
bad_examples += 1
continue
prompt = prompt_match.group(1).strip()
essay_match = essay_re.search(training_text)
if not essay_match:
bad_examples += 1
continue
essay = essay_match.group(1).strip()
if not prompt or not essay or len(essay) < 50:
bad_examples += 1
continue
scores = {}
for key, pattern in score_patterns.items():
match = pattern.search(training_text)
if match:
scores[key] = to_float_safe(match.group(1))
else:
scores[key] = None
            if all(v is not None for v in scores.values()):
standard_scores = {
"task_response": scores["task_response"],
"coherence_cohesion": scores["coherence_cohesion"],
"lexical_resource": scores["lexical_resource"],
"grammatical_range": scores["grammatical_range"]
}
cleaned.append({
"prompt_text": prompt,
"essay_text": essay,
"scores": standard_scores
})
else:
bad_examples += 1
except Exception:
bad_examples += 1
print(f" ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
return cleaned
def main():
print("Đang tải các dataset từ Hugging Face...\n")
cache_dir = "./.cache/huggingface_datasets"
all_data = []
# Dataset 1: chillies/IELTS-writing-task-2-evaluation
try:
ds_chillies = load_dataset(
"chillies/IELTS-writing-task-2-evaluation",
split="train",
cache_dir=cache_dir
)
all_data.append(("chillies", parse_chillies_dataset(ds_chillies)))
except Exception as e:
print(f"✗ Lỗi tải chillies: {e}\n")
# Dataset 2: 123Harr/IELTS-WT2-LLaMa3-1k
try:
ds_123harr = load_dataset(
"123Harr/IELTS-WT2-LLaMa3-1k",
split="train",
cache_dir=cache_dir
)
all_data.append(("123Harr", parse_123harr_dataset(ds_123harr)))
except Exception as e:
print(f"✗ Lỗi tải 123Harr: {e}\n")
# Dataset 3: chillies/DPO_ielts_writing
try:
ds_chillies_2 = load_dataset(
"chillies/DPO_ielts_writing",
split="train",
cache_dir=cache_dir
)
all_data.append(("DPO", parse_dpo_dataset(ds_chillies_2)))
except Exception as e:
print(f"✗ Lỗi tải DPO: {e}\n")
    # Dataset 4: hadeelbkh/tokenized-IELTS-writing-task-2-evaluation-DialoGPT-medium
try:
ds_hadeel = load_dataset(
"hadeelbkh/tokenized-IELTS-writing-task-2-evaluation-DialoGPT-medium",
split="train",
cache_dir=cache_dir
)
all_data.append(("hadeel", parse_hadeel_dataset(ds_hadeel)))
except Exception as e:
print(f"✗ Lỗi tải hadeel: {e}\n")
# Dataset 5: vietanh0802/ielts_writing_training_data_prepared
try:
ds_vietanh = load_dataset(
"vietanh0802/ielts_writing_training_data_prepared",
split="train",
cache_dir=cache_dir
)
all_data.append(("vietanh", parse_vietanh_dataset(ds_vietanh)))
except Exception as e:
print(f"✗ Lỗi tải vietanh: {e}\n")
    # Tally counts per dataset
    print("\n" + "="*60)
    print("--- SUMMARY ---")
print("="*60)
total = 0
for name, data in all_data:
count = len(data)
total += count
print(f"Dataset ({name:15}): {count:5d} mẫu")
print("="*60)
print(f"Tổng cộng mẫu hợp lệ: {total}")
print("="*60)
final_dataset = []
for name, data in all_data:
final_dataset.extend(data)
if not final_dataset:
print("✗ Lỗi: Không có dữ liệu nào được chuẩn hóa. Vui lòng kiểm tra lại script.")
return
output_dir = "data"
output_path = os.path.join(output_dir, "dataset_for_scorer.json")
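    # Each record in the output file has the shape below (score values are illustrative):
    #   {"prompt_text": "...", "essay_text": "...",
    #    "scores": {"task_response": 7.0, "coherence_cohesion": 6.5,
    #               "lexical_resource": 7.0, "grammatical_range": 6.5}}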
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print(f"✓ Đã tạo thư mục {output_dir}")
with open(output_path, "w", encoding="utf-8") as f:
json.dump(final_dataset, f, ensure_ascii=False, indent=2)
print(f"✓ Đã ghi {len(final_dataset)} mẫu vào file '{output_path}'.")
print("\n✓ Hoàn tất! Bây giờ bạn có thể chạy 'src/train.py' trên Colab!")
if __name__ == "__main__":
main()