{ "bpe": { "v8000_mf2": { "out_dir": "results\\bpe\\v8000_mf2", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 96.0113, "avg_processing_time_ms": 0.19588143825531007, "compression_ratio": 96.0113, "total_tokens_evaluated": 1920226, "unk_count": 0, "train_time_s": 105.87230825424194, "config": { "vocab_size": 8000, "min_frequency": 2, "continuing_subword_prefix": "##" } } }, "v8000_mf5": { "out_dir": "results\\bpe\\v8000_mf5", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 96.0113, "avg_processing_time_ms": 0.19508297443389894, "compression_ratio": 96.0113, "total_tokens_evaluated": 1920226, "unk_count": 0, "train_time_s": 115.85335993766785, "config": { "vocab_size": 8000, "min_frequency": 5, "continuing_subword_prefix": "##" } } }, "v16000_mf2": { "out_dir": "results\\bpe\\v16000_mf2", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 84.86375, "avg_processing_time_ms": 0.1791509985923767, "compression_ratio": 84.86375, "total_tokens_evaluated": 1697275, "unk_count": 0, "train_time_s": 122.03794264793396, "config": { "vocab_size": 16000, "min_frequency": 2, "continuing_subword_prefix": "##" } } }, "v16000_mf5": { "out_dir": "results\\bpe\\v16000_mf5", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 84.86375, "avg_processing_time_ms": 0.1843635559082031, "compression_ratio": 84.86375, "total_tokens_evaluated": 1697275, "unk_count": 0, "train_time_s": 119.14113140106201, "config": { "vocab_size": 16000, "min_frequency": 5, "continuing_subword_prefix": "##" } } }, "v32000_mf2": { "out_dir": "results\\bpe\\v32000_mf2", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 77.17065, "avg_processing_time_ms": 0.18579285144805907, "compression_ratio": 77.17065, "total_tokens_evaluated": 1543413, "unk_count": 0, "train_time_s": 122.94540190696716, "config": { "vocab_size": 32000, "min_frequency": 2, "continuing_subword_prefix": "##" } } }, "v32000_mf5": { "out_dir": "results\\bpe\\v32000_mf5", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 77.17065, "avg_processing_time_ms": 0.1811486840248108, "compression_ratio": 77.17065, "total_tokens_evaluated": 1543413, "unk_count": 0, "train_time_s": 122.62627506256104, "config": { "vocab_size": 32000, "min_frequency": 5, "continuing_subword_prefix": "##" } } } }, "wordpiece": { "v8000_mf1": { "out_dir": "results\\wordpiece\\v8000_mf1", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 95.39795, "avg_processing_time_ms": 31.364226222038273, "compression_ratio": 95.39795, "total_tokens_evaluated": 1907959, "unk_count": 0, "train_time_s": 124.3489019870758, "config": { "vocab_size": 8000, "min_frequency": 1 } } }, "v8000_mf2": { "out_dir": "results\\wordpiece\\v8000_mf2", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 95.39795, "avg_processing_time_ms": 0.22379395961761475, "compression_ratio": 95.39795, "total_tokens_evaluated": 1907959, "unk_count": 0, "train_time_s": 176.4660017490387, "config": { "vocab_size": 8000, "min_frequency": 2 } } }, "v16000_mf1": { "out_dir": "results\\wordpiece\\v16000_mf1", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 84.55695, "avg_processing_time_ms": 0.2237707018852234, "compression_ratio": 84.55695, "total_tokens_evaluated": 1691139, "unk_count": 0, "train_time_s": 184.54623937606812, "config": { "vocab_size": 16000, "min_frequency": 1 } } }, "v16000_mf2": { "out_dir": "results\\wordpiece\\v16000_mf2", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 84.55695, "avg_processing_time_ms": 0.2417303204536438, "compression_ratio": 84.55695, "total_tokens_evaluated": 1691139, "unk_count": 0, "train_time_s": 318.9338138103485, "config": { "vocab_size": 16000, "min_frequency": 2 } } }, "v32000_mf1": { "out_dir": "results\\wordpiece\\v32000_mf1", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 76.92375, "avg_processing_time_ms": 0.2857889056205749, "compression_ratio": 76.92375, "total_tokens_evaluated": 1538475, "unk_count": 0, "train_time_s": 158.26075053215027, "config": { "vocab_size": 32000, "min_frequency": 1 } } }, "v32000_mf2": { "out_dir": "results\\wordpiece\\v32000_mf2", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 76.92375, "avg_processing_time_ms": 0.518797504901886, "compression_ratio": 76.92375, "total_tokens_evaluated": 1538475, "unk_count": 0, "train_time_s": 157.1074833869934, "config": { "vocab_size": 32000, "min_frequency": 2 } } } }, "unigram": { "v8000": { "out_dir": "results\\unigram\\v8000", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 101.5805, "avg_processing_time_ms": 0.3227068305015564, "compression_ratio": 101.5805, "total_tokens_evaluated": 2031610, "unk_count": 0, "train_time_s": 601.7949032783508, "config": { "vocab_size": 8000 } } }, "v16000": { "out_dir": "results\\unigram\\v16000", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 90.8909, "avg_processing_time_ms": 0.29166127443313594, "compression_ratio": 90.8909, "total_tokens_evaluated": 1817818, "unk_count": 0, "train_time_s": 614.1360929012299, "config": { "vocab_size": 16000 } } }, "v32000": { "out_dir": "results\\unigram\\v32000", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 83.3668, "avg_processing_time_ms": 0.32854799032211307, "compression_ratio": 83.3668, "total_tokens_evaluated": 1667336, "unk_count": 0, "train_time_s": 757.2155563831329, "config": { "vocab_size": 32000 } } } }, "spm": { "v8000": { "out_dir": "results\\spm_unigram\\v8000", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 107.90535, "avg_processing_time_ms": 0.11760829687118529, "compression_ratio": 107.90535, "total_tokens_evaluated": 2158107, "unk_count": 0, "unk_piece_used": "[UNK]", "train_time_s": 343.80153012275696, "config": { "vocab_size": 8000 } } }, "v16000": { "out_dir": "results\\spm_unigram\\v16000", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 95.67175, "avg_processing_time_ms": 0.160364830493927, "compression_ratio": 95.67175, "total_tokens_evaluated": 1913435, "unk_count": 0, "unk_piece_used": "[UNK]", "train_time_s": 477.8609836101532, "config": { "vocab_size": 16000 } } }, "v32000": { "out_dir": "results\\spm_unigram\\v32000", "metrics": { "oov_rate": 0.0, "avg_sequence_length": 86.6945, "avg_processing_time_ms": 0.1026016116142273, "compression_ratio": 86.6945, "total_tokens_evaluated": 1733890, "unk_count": 0, "unk_piece_used": "[UNK]", "train_time_s": 249.83488726615906, "config": { "vocab_size": 32000 } } } }, "metadata": { "corpus_path": "full_tatar_raw_corpus_clean.txt", "vocab_sizes": [ 8000, 16000, 32000 ], "sample_size": 20000, "seed": 42, "selected_models": [ "bpe", "wordpiece", "unigram", "spm" ], "timestamp": "2025-11-19 21:10:06" } }