{
  "epoch": 9,
  "global_step": 994980,
  "tokens_processed": 996862184,
  "target_tokens": 100000000,
  "best_similarity": 0.4551965296268463,
  "training_config": {
    "model": {
      "vocab_size": 50257,
      "text_encoder_dim": 128,
      "text_encoder_layers": 4,
      "text_encoder_heads": 4,
      "text_decoder_dim": 128,
      "text_decoder_layers": 4,
      "text_decoder_heads": 4,
      "vision_encoder_dim": 768,
      "vision_latent_size": 128,
      "vision_hidden_size": 64,
      "vision_compression_method": "learned_compression",
      "vision_spatial_pooling": true,
      "vision_pool_size": 2,
      "fusion_hidden_size": 128,
      "fusion_num_heads": 4,
      "fusion_num_layers": 2,
      "memory_size": 32,
      "episode_dim": 128,
      "memory_alpha": 0.2,
      "direct_writing": true,
      "memory_compression": true,
      "enable_adaptive_training": true,
      "max_seq_len": 256,
      "dropout": 0.15
    },
    "token_constraints": {
      "total_tokens": 100000000,
      "caption_tokens": 50000000,
      "text_tokens": 50000000,
      "enforce_exact_count": true,
      "uniform_sampling": true,
      "alignment_priority": "perfect_alignment",
      "preserve_image_caption_pairs": true,
      "strict_alignment_validation": true
    },
    "vision_feature_reduction": {
      "enabled": true,
      "method": "learned_compression",
      "target_dim": 64,
      "spatial_pooling": true,
      "pool_method": "attention",
      "hidden_dim": 128,
      "learnable": true,
      "preserve_spatial_info": true
    },
    "data": {
      "dataset_dir": "../babylm_dataset",
      "text_encoder_name": "gpt2",
      "max_seq_length": 256,
      "count_tokens": true,
      "target_caption_tokens": 50000000,
      "target_text_tokens": 50000000,
      "token_counting_method": "gpt2",
      "batch_size": 64,
      "num_workers": 6,
      "pin_memory": true,
      "persistent_workers": true,
      "mix_ratio": 0.5,
      "shuffle_datasets": true,
      "ensure_alignment": true,
      "validate_alignment": true,
      "alignment_verification": "strict",
      "never_break_pairs": true,
      "alignment_check_frequency": 1000,
      "use_validation": false,
      "train_only": true
    },
    "attention_analysis": {
      "track_top_k": 5,
      "log_every_n_steps": 200,
      "viz_every_n_epochs": 3,
      "save_head_patterns": true,
      "analyze_memory_attention": true,
      "analyze_cross_modal": true,
      "track_token_alignment": true
    },
    "adaptive_training": {
      "enabled": true,
      "similarity_window_size": 200,
      "drop_threshold": 0.12,
      "min_steps_between_interventions": 800,
      "freeze_duration_steps": 1500,
      "loss_rebalance_factor": 2.0,
      "similarity_smoothing_alpha": 0.15
    },
    "training": {
      "max_epochs": 10,
      "accumulate_grad_batches": 2,
      "gradient_clip_val": 0.3,
      "val_check_interval": 1000,
      "scheduler": "cosine_with_restarts",
      "min_lr": 5e-05,
      "warmup_steps": 1000,
      "learning_rate": 0.0002,
      "weight_decay": 0.02,
      "optimizer": "adamw8bit",
      "scheduler_config": {
        "T_0": 1000,
        "T_mult": 2,
        "eta_min_ratio": 0.1
      },
      "cross_modal_loss_weight": 1.5,
      "text_generation_loss_weight": 1.0,
      "memory_regularization_weight": 0.1,
      "alignment_consistency_weight": 0.5,
      "track_token_usage": true,
      "log_token_progress": true,
      "stop_at_token_limit": false,
      "validate_alignment_every_n_steps": 500,
      "log_alignment_metrics": true,
      "alignment_loss_scaling": "adaptive"
    },
    "wandb": {
      "project": "bitmar-100M-attention-epochs",
      "entity": "babylm-ntust",
      "api_key": null,
      "log_every_n_steps": 100,
      "log_attention": true,
      "log_memory": true,
      "log_gradients": true,
      "log_token_usage": true,
      "log_cross_modal_similarity": true,
      "log_alignment_quality": true,
      "log_caption_image_matching": true,
      "save_code": true,
      "create_plots": true,
      "plot_attention_heatmaps": true,
      "plot_memory_usage": true,
      "plot_token_distribution": true,
      "plot_alignment_metrics": true,
      "log_memory_evolution": true,
      "plot_memory_evolution_heatmap": true,
      "plot_memory_diversity": true,
      "plot_memory_access_patterns": true,
      "memory_visualization_frequency": 5000,
      "memory_snapshot_frequency": 10000,
      "track_memory_metrics": [
        "memory_diversity_score",
        "memory_specialization_score",
        "memory_usage_entropy",
        "cross_modal_memory_ratio",
        "memory_slot_utilization",
        "memory_update_frequency",
        "memory_retrieval_accuracy"
      ]
    },
    "evaluation": {
      "metrics": [
        "bleu",
        "rouge",
        "cross_modal_similarity",
        "memory_efficiency"
      ],
      "generate_samples": true,
      "num_samples": 20,
      "max_generation_length": 32,
      "temperature": 0.8,
      "top_p": 0.9,
      "evaluate_alignment": true,
      "alignment_metrics": [
        "cosine_similarity",
        "retrieval_accuracy",
        "caption_image_matching",
        "cross_modal_retrieval"
      ],
      "alignment_threshold": 0.8,
      "validate_pairs_during_eval": true
    },
    "output": {
      "checkpoint_dir": "checkpoints_100M_dataset",
      "log_dir": "logs_100M_dataset",
      "attention_dir": "attention_100M_dataset",
      "memory_dir": "memory_100M_dataset",
      "results_dir": "results_100M_dataset",
      "token_logs_dir": "token_logs_100M_dataset"
    },
    "memory_optimization": {
      "use_gradient_checkpointing": true,
      "use_fp16": true,
      "use_int8_vision": false,
      "empty_cache_frequency": 10,
      "max_memory_slots_in_ram": 16,
      "compress_episodic_memory": true,
      "vision_feature_caching": false,
      "vision_batch_processing": true,
      "tie_word_embeddings": true,
      "use_shared_attention": false
    },
    "performance_targets": {
      "max_model_size_mb": 50,
      "target_cross_modal_similarity": 0.75,
      "target_text_generation_quality": 0.6,
      "memory_efficiency_threshold": 0.8
    },
    "flops_tracking": {
      "enabled": true,
      "log_frequency": 100,
      "save_statistics": true,
      "estimate_theoretical": true,
      "track_peak_performance": true,
      "log_to_wandb": true,
      "detailed_breakdown": true,
      "memory_bandwidth_tracking": false,
      "efficiency_analysis": true,
      "track_components": [
        "attention",
        "feedforward",
        "layer_norm",
        "embeddings",
        "vision_encoder",
        "cross_modal_fusion"
      ]
    },
    "token_tracking": {
      "log_frequency": 1000,
      "save_token_distribution": true,
      "monitor_caption_text_ratio": true,
      "enforce_token_limits": false,
      "early_stopping_on_limit": false,
      "track_alignment_quality": true,
      "log_misaligned_samples": true,
      "alignment_quality_threshold": 0.7,
      "save_alignment_statistics": true,
      "correlate_flops_with_tokens": true,
      "log_computational_efficiency": true,
      "track_throughput_vs_quality": true
    },
    "huggingface_hub": {
      "enabled": true,
      "repo_id": "euhidaman/bitmar-attention-multimodal",
      "private": true,
      "upload_after_epoch": true,
      "upload_final_model": true,
      "commit_message_template": "BitMar 100M tokens - Epoch {epoch} - {tokens_processed:,} tokens processed",
      "create_model_card": true,
      "model_card_template": "---\nlanguage: en\nlicense: mit\ntags:\n- bitmar\n- multimodal\n- babylm\n- cross-modal\ndatasets:\n- babylm_multimodal\nmetrics:\n- bleu\n- cross_modal_similarity\n---\n\n# BitMar 100M Token Model\n\nThis model was trained on a dataset of exactly 100 million tokens as part of the BabyLM challenge.\n\n## Training Details\n- Dataset tokens: 100,000,000\n- Epochs completed: {epoch}\n- Tokens processed: {tokens_processed:,}\n- Cross-modal similarity: {best_similarity:.4f}\n\n## Model Architecture\n- Text encoder: {text_encoder_layers} layers, {text_encoder_dim} hidden size\n- Vision encoder: DINOv2 features compressed to {vision_latent_size}\n- Episodic memory: {memory_size} slots\n\n## Usage\n```python\nfrom transformers import AutoModel, AutoTokenizer\n\nmodel = AutoModel.from_pretrained(\"{repo_id}\")\ntokenizer = AutoTokenizer.from_pretrained(\"{repo_id}\")\n```\n"
    },
    "attention_sinks": {
      "enabled": true,
      "attention_sink_size": 4,
      "attention_sink_window_size": 1020,
      "inject_to_text_encoder": true,
      "inject_to_text_decoder": true,
      "position_shift_enabled": true,
      "cache_compression": true,
      "adaptive_window_size": false,
      "memory_efficient_attention": true,
      "preserve_episodic_memory": true,
      "preserve_quantization": true,
      "preserve_cross_modal_fusion": true
    }
  }
}