{ "epoch": 9, "global_step": 994980, "tokens_processed": 996862184, "target_tokens": 100000000, "best_similarity": 0.4551965296268463, "training_config": { "model": { "vocab_size": 50257, "text_encoder_dim": 128, "text_encoder_layers": 4, "text_encoder_heads": 4, "text_decoder_dim": 128, "text_decoder_layers": 4, "text_decoder_heads": 4, "vision_encoder_dim": 768, "vision_latent_size": 128, "vision_hidden_size": 64, "vision_compression_method": "learned_compression", "vision_spatial_pooling": true, "vision_pool_size": 2, "fusion_hidden_size": 128, "fusion_num_heads": 4, "fusion_num_layers": 2, "memory_size": 32, "episode_dim": 128, "memory_alpha": 0.2, "direct_writing": true, "memory_compression": true, "enable_adaptive_training": true, "max_seq_len": 256, "dropout": 0.15 }, "token_constraints": { "total_tokens": 100000000, "caption_tokens": 50000000, "text_tokens": 50000000, "enforce_exact_count": true, "uniform_sampling": true, "alignment_priority": "perfect_alignment", "preserve_image_caption_pairs": true, "strict_alignment_validation": true }, "vision_feature_reduction": { "enabled": true, "method": "learned_compression", "target_dim": 64, "spatial_pooling": true, "pool_method": "attention", "hidden_dim": 128, "learnable": true, "preserve_spatial_info": true }, "data": { "dataset_dir": "../babylm_dataset", "text_encoder_name": "gpt2", "max_seq_length": 256, "count_tokens": true, "target_caption_tokens": 50000000, "target_text_tokens": 50000000, "token_counting_method": "gpt2", "batch_size": 64, "num_workers": 6, "pin_memory": true, "persistent_workers": true, "mix_ratio": 0.5, "shuffle_datasets": true, "ensure_alignment": true, "validate_alignment": true, "alignment_verification": "strict", "never_break_pairs": true, "alignment_check_frequency": 1000, "use_validation": false, "train_only": true }, "attention_analysis": { "track_top_k": 5, "log_every_n_steps": 200, "viz_every_n_epochs": 3, "save_head_patterns": true, "analyze_memory_attention": true, "analyze_cross_modal": true, "track_token_alignment": true }, "adaptive_training": { "enabled": true, "similarity_window_size": 200, "drop_threshold": 0.12, "min_steps_between_interventions": 800, "freeze_duration_steps": 1500, "loss_rebalance_factor": 2.0, "similarity_smoothing_alpha": 0.15 }, "training": { "max_epochs": 10, "accumulate_grad_batches": 2, "gradient_clip_val": 0.3, "val_check_interval": 1000, "scheduler": "cosine_with_restarts", "min_lr": 5e-05, "warmup_steps": 1000, "learning_rate": 0.0002, "weight_decay": 0.02, "optimizer": "adamw8bit", "scheduler_config": { "T_0": 1000, "T_mult": 2, "eta_min_ratio": 0.1 }, "cross_modal_loss_weight": 1.5, "text_generation_loss_weight": 1.0, "memory_regularization_weight": 0.1, "alignment_consistency_weight": 0.5, "track_token_usage": true, "log_token_progress": true, "stop_at_token_limit": false, "validate_alignment_every_n_steps": 500, "log_alignment_metrics": true, "alignment_loss_scaling": "adaptive" }, "wandb": { "project": "bitmar-100M-attention-epochs", "entity": "babylm-ntust", "api_key": null, "log_every_n_steps": 100, "log_attention": true, "log_memory": true, "log_gradients": true, "log_token_usage": true, "log_cross_modal_similarity": true, "log_alignment_quality": true, "log_caption_image_matching": true, "save_code": true, "create_plots": true, "plot_attention_heatmaps": true, "plot_memory_usage": true, "plot_token_distribution": true, "plot_alignment_metrics": true, "log_memory_evolution": true, "plot_memory_evolution_heatmap": true, "plot_memory_diversity": true, "plot_memory_access_patterns": true, "memory_visualization_frequency": 5000, "memory_snapshot_frequency": 10000, "track_memory_metrics": [ "memory_diversity_score", "memory_specialization_score", "memory_usage_entropy", "cross_modal_memory_ratio", "memory_slot_utilization", "memory_update_frequency", "memory_retrieval_accuracy" ] }, "evaluation": { "metrics": [ "bleu", "rouge", "cross_modal_similarity", "memory_efficiency" ], "generate_samples": true, "num_samples": 20, "max_generation_length": 32, "temperature": 0.8, "top_p": 0.9, "evaluate_alignment": true, "alignment_metrics": [ "cosine_similarity", "retrieval_accuracy", "caption_image_matching", "cross_modal_retrieval" ], "alignment_threshold": 0.8, "validate_pairs_during_eval": true }, "output": { "checkpoint_dir": "checkpoints_100M_dataset", "log_dir": "logs_100M_dataset", "attention_dir": "attention_100M_dataset", "memory_dir": "memory_100M_dataset", "results_dir": "results_100M_dataset", "token_logs_dir": "token_logs_100M_dataset" }, "memory_optimization": { "use_gradient_checkpointing": true, "use_fp16": true, "use_int8_vision": false, "empty_cache_frequency": 10, "max_memory_slots_in_ram": 16, "compress_episodic_memory": true, "vision_feature_caching": false, "vision_batch_processing": true, "tie_word_embeddings": true, "use_shared_attention": false }, "performance_targets": { "max_model_size_mb": 50, "target_cross_modal_similarity": 0.75, "target_text_generation_quality": 0.6, "memory_efficiency_threshold": 0.8 }, "flops_tracking": { "enabled": true, "log_frequency": 100, "save_statistics": true, "estimate_theoretical": true, "track_peak_performance": true, "log_to_wandb": true, "detailed_breakdown": true, "memory_bandwidth_tracking": false, "efficiency_analysis": true, "track_components": [ "attention", "feedforward", "layer_norm", "embeddings", "vision_encoder", "cross_modal_fusion" ] }, "token_tracking": { "log_frequency": 1000, "save_token_distribution": true, "monitor_caption_text_ratio": true, "enforce_token_limits": false, "early_stopping_on_limit": false, "track_alignment_quality": true, "log_misaligned_samples": true, "alignment_quality_threshold": 0.7, "save_alignment_statistics": true, "correlate_flops_with_tokens": true, "log_computational_efficiency": true, "track_throughput_vs_quality": true }, "huggingface_hub": { "enabled": true, "repo_id": "euhidaman/bitmar-attention-multimodal", "private": true, "upload_after_epoch": true, "upload_final_model": true, "commit_message_template": "BitMar 100M tokens - Epoch {epoch} - {tokens_processed:,} tokens processed", "create_model_card": true, "model_card_template": "---\nlanguage: en\nlicense: mit\ntags:\n- bitmar\n- multimodal\n- babylm\n- cross-modal\ndatasets:\n- babylm_multimodal\nmetrics:\n- bleu\n- cross_modal_similarity\n---\n\n# BitMar 100M Token Model\n\nThis model was trained on exactly 100 million tokens as part of the BabyLM challenge.\n\n## Training Details\n- Total tokens: 100,000,000\n- Epochs completed: {epoch}\n- Tokens processed: {tokens_processed:,}\n- Cross-modal similarity: {best_similarity:.4f}\n\n## Model Architecture\n- Text encoder: {text_encoder_layers} layers, {text_encoder_dim} hidden size\n- Vision encoder: DiNOv2 features compressed to {vision_latent_size}\n- Episodic memory: {memory_size} slots\n\n## Usage\n```python\nfrom transformers import AutoModel, AutoTokenizer\n\nmodel = AutoModel.from_pretrained(\"{repo_id}\")\ntokenizer = AutoTokenizer.from_pretrained(\"{repo_id}\")\n```\n" }, "attention_sinks": { "enabled": true, "attention_sink_size": 4, "attention_sink_window_size": 1020, "inject_to_text_encoder": true, "inject_to_text_decoder": true, "position_shift_enabled": true, "cache_compression": true, "adaptive_window_size": false, "memory_efficient_attention": true, "preserve_episodic_memory": true, "preserve_quantization": true, "preserve_cross_modal_fusion": true } } }