| import re | |
def clean_text(text: str) -> str:
    """Normalize line endings and collapse runs of blank lines.

    All line endings are converted to ``\\n``, any run of two or more
    consecutive newlines is reduced to exactly two (one blank line), and
    leading/trailing whitespace is stripped.

    Args:
        text: Raw input text; may use ``\\n``, ``\\r\\n`` or ``\\r`` endings.

    Returns:
        The cleaned text.
    """
    # Handle CRLF before bare CR: replacing '\r' alone would turn every
    # '\r\n' into '\n\n' and make each Windows line break look like a
    # paragraph break after the collapse step below.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = re.sub(r"\n{2,}", '\n\n', text)
    return text.strip()
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. The final
    chunk may be shorter than ``chunk_size``.

    Args:
        text: Input text; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each with its words joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A step of zero or less would never advance through the words.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        # Once a window reaches the last word, any further window would
        # contain only overlap words already present in this chunk.
        if end >= len(words):
            break
    return chunks