omarkamali committed 3 days ago

Commit

90f3b6c

verified ·

1 Parent(s): 20e4a73

Upload all models and assets for avk (latest)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
README.md +188 -142
models/embeddings/aligned/avk_128d.bin +3 -0
models/embeddings/aligned/avk_128d.meta.json +1 -0
models/embeddings/aligned/avk_128d.projection.npy +3 -0
models/embeddings/aligned/avk_128d_metadata.json +8 -0
models/embeddings/aligned/avk_32d.bin +3 -0
models/embeddings/aligned/avk_32d.meta.json +1 -0
models/embeddings/aligned/avk_32d.projection.npy +3 -0
models/embeddings/aligned/avk_32d_metadata.json +8 -0
models/embeddings/aligned/avk_64d.bin +3 -0
models/embeddings/aligned/avk_64d.meta.json +1 -0
models/embeddings/aligned/avk_64d.projection.npy +3 -0
models/embeddings/aligned/avk_64d_metadata.json +8 -0
models/embeddings/monolingual/avk_128d.bin +2 -2
models/embeddings/monolingual/avk_128d_metadata.json +1 -1
models/embeddings/monolingual/avk_32d.bin +2 -2
models/embeddings/monolingual/avk_32d_metadata.json +1 -1
models/embeddings/monolingual/avk_64d.bin +2 -2
models/embeddings/monolingual/avk_64d_metadata.json +1 -1
models/subword_markov/avk_markov_ctx1_subword.parquet +2 -2
models/subword_markov/avk_markov_ctx1_subword_metadata.json +1 -1
models/subword_markov/avk_markov_ctx2_subword.parquet +2 -2
models/subword_markov/avk_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/avk_markov_ctx3_subword.parquet +2 -2
models/subword_markov/avk_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/avk_markov_ctx4_subword.parquet +2 -2
models/subword_markov/avk_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/avk_2gram_subword.parquet +2 -2
models/subword_ngram/avk_2gram_subword_metadata.json +2 -2
models/subword_ngram/avk_3gram_subword.parquet +2 -2
models/subword_ngram/avk_3gram_subword_metadata.json +2 -2
models/subword_ngram/avk_4gram_subword.parquet +2 -2
models/subword_ngram/avk_4gram_subword_metadata.json +2 -2
models/subword_ngram/avk_5gram_subword.parquet +3 -0
models/subword_ngram/avk_5gram_subword_metadata.json +7 -0
models/tokenizer/avk_tokenizer_16k.model +2 -2
models/tokenizer/avk_tokenizer_16k.vocab +0 -0
models/tokenizer/avk_tokenizer_32k.model +2 -2
models/tokenizer/avk_tokenizer_32k.vocab +0 -0
models/tokenizer/avk_tokenizer_64k.model +2 -2
models/tokenizer/avk_tokenizer_64k.vocab +0 -0
models/tokenizer/avk_tokenizer_8k.model +2 -2
models/tokenizer/avk_tokenizer_8k.vocab +0 -0
models/vocabulary/avk_vocabulary.parquet +2 -2
models/vocabulary/avk_vocabulary_metadata.json +9 -9
models/word_markov/avk_markov_ctx1_word.parquet +2 -2
models/word_markov/avk_markov_ctx1_word_metadata.json +2 -2
models/word_markov/avk_markov_ctx2_word.parquet +2 -2
models/word_markov/avk_markov_ctx2_word_metadata.json +2 -2

.gitattributes CHANGED Viewed

@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text

 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 language: avk
-language_name: AVK
 language_family: constructed_other
 tags:
   - wikilangs
@@ -10,11 +10,21 @@ tags:
   - n-gram
   - markov
   - wikipedia
   - monolingual
   - family-constructed_other
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 4.690
   - name: best_isotropy
     type: isotropy
-    value: 0.8793
   - name: vocabulary_size
     type: vocab
     value: 0
 generated: 2026-01-03
 ---
-# AVK - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **AVK** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -80,23 +90,23 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.687x | 3.69 | 0.2370% | 256,066 |
-| **16k** | 4.050x | 4.05 | 0.2604% | 233,136 |
-| **32k** | 4.380x | 4.38 | 0.2816% | 215,576 |
-| **64k** | 4.690x 🏆 | 4.69 | 0.3015% | 201,338 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `Bifa Afrika Amerika Asia Europa Oceania Koblira Awalkera sanda`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁bifa ▁afrika ▁amerika ▁asia ▁europa ▁oceania ▁koblira ▁awalkera ▁sanda` | 9 |
-| 16k | `▁bifa ▁afrika ▁amerika ▁asia ▁europa ▁oceania ▁koblira ▁awalkera ▁sanda` | 9 |
-| 32k | `▁bifa ▁afrika ▁amerika ▁asia ▁europa ▁oceania ▁koblira ▁awalkera ▁sanda` | 9 |
-| 64k | `▁bifa ▁afrika ▁amerika ▁asia ▁europa ▁oceania ▁koblira ▁awalkera ▁sanda` | 9 |
 **Sample 2:** `Bifa Afrika Amerika Asia Europa Oceania Koblira Awalkera sanda`
@@ -119,7 +129,7 @@ Below are sample sentences tokenized with each vocabulary size:
 ### Key Findings
-- **Best Compression:** 64k achieves 4.690x compression
 - **Lowest UNK Rate:** 8k with 0.2370% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -137,12 +147,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | Word | 4,363 | 12.09 | 65,566 | 38.8% | 59.5% |
-| **2-gram** | Subword | 284 🏆 | 8.15 | 3,342 | 63.4% | 99.6% |
-| **3-gram** | Word | 9,089 | 13.15 | 131,822 | 34.4% | 51.4% |
-| **3-gram** | Subword | 1,998 | 10.96 | 24,557 | 26.3% | 74.1% |
-| **4-gram** | Word | 16,964 | 14.05 | 222,393 | 30.3% | 44.4% |
-| **4-gram** | Subword | 7,482 | 12.87 | 124,823 | 17.3% | 50.8% |
 ### Top 5 N-grams by Size
@@ -154,7 +166,7 @@ Below are sample sentences tokenized with each vocabulary size:
 | 2 | `of life` | 25,896 |
 | 3 | `of the` | 24,998 |
 | 4 | `the world` | 24,670 |
-| 5 | `species of` | 24,652 |
 **3-grams (Word):**
@@ -173,45 +185,65 @@ Below are sample sentences tokenized with each vocabulary size:
 | 1 | `species of the world` | 24,652 |
 | 2 | `mammal species of the` | 24,652 |
 | 3 | `bak taneon zo pimtayar` | 15,309 |
-| 4 | `zo pimtayar vexala dem` | 15,224 |
-| 5 | `taneon zo pimtayar vexala` | 15,223 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `a _` | 684,240 |
-| 2 | `s _` | 476,785 |
-| 3 | `_ (` | 458,313 |
-| 4 | `e _` | 387,458 |
-| 5 | `_ v` | 360,515 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `_ : _` | 268,667 |
-| 2 | `u s _` | 176,828 |
-| 3 | `e s t` | 175,968 |
-| 4 | `_ v u` | 167,656 |
-| 5 | `u e s` | 166,553 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `u e s t` | 164,190 |
 | 2 | `_ v u e` | 163,879 |
 | 3 | `v u e s` | 163,702 |
 | 4 | `) _ v u` | 124,953 |
 | 5 | `e s t -` | 124,892 |
 ### Key Findings
 - **Best Perplexity:** 2-gram (subword) with 284
 - **Entropy Trend:** Increases with larger n-grams (12.09 → 14.05 for word, 8.15 → 12.87 for subword)
-- **Coverage:** Top-1000 patterns cover ~51% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -227,14 +259,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | Word | 0.9061 | 1.874 | 5.53 | 115,135 | 9.4% |
-| **1** | Subword | 1.0381 | 2.053 | 7.87 | 900 | 0.0% |
-| **2** | Word | 0.2494 | 1.189 | 1.63 | 635,326 | 75.1% |
-| **2** | Subword | 0.9486 | 1.930 | 5.95 | 7,086 | 5.1% |
-| **3** | Word | 0.1397 | 1.102 | 1.31 | 1,030,372 | 86.0% |
-| **3** | Subword | 0.7945 | 1.734 | 4.30 | 42,171 | 20.6% |
-| **4** | Word | 0.1004 🏆 | 1.072 | 1.21 | 1,346,672 | 90.0% |
-| **4** | Subword | 0.6921 | 1.616 | 3.14 | 181,330 | 30.8% |
 ### Generated Text Samples (Word-based)
@@ -242,27 +274,27 @@ Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
-1. `en forsythe david friedrich germanaf suterotik kiren sini va kulak ke patecta divatcewer kazaxo koe ...`
-2. `vuest paleobiology database lagidium viscacia katca ctenomys johannis dene internet ok ino va jontik...`
-3. `ke zosteropidae yasa ke capensis philippsi hinton en vuest walvedeyafa zveriopafa aba leptotrygon ti...`
 **Context Size 2:**
-1. `en vuest paleobiology database pipistrellus kuhlii lepidus blyth en vuest animal diversity web leopa...`
-2. `of life procyon lotor grinnelli nelson and goldman tovumol thomomys umbrinus nelsoni merriam en vues...`
-3. `of the world siatos ke bata katca vas 17 oxi zo torigir ise va volkeafi is kategisafi`
 **Context Size 3:**
-1. `of the world siatos ke konakara apta dere tid ke mila veyafa katca putcuxol cephalophus ogilbyi putc...`
-2. `mammal species of the world v 3 leptailurus serval lonnbergi cabrera abrugol leptailurus serval beir...`
-3. `species of the world v 3 solenodontidae gill en vuest animal diversity web corythopis en vuest anima...`
 **Context Size 4:**
-1. `species of the world siatos ke bata katca tir aptiskafa pulasa vuestexa is xantaza en vuest mammal s...`
-2. `mammal species of the world v 3 sminthopsis griseoventer kitchener stoddart en fr vuest itis glaucom...`
-3. `bak taneon zo pimtayar vexala dem apteem sedme mammal species of the world siatos ke konakara apta d...`
 ### Generated Text Samples (Subword-based)
@@ -271,34 +303,34 @@ Below are text samples generated from each subword-based Markov chain model:
 **Context Size 1:**
-1. `_(_denata_(erus_`
-2. `asa_(_sideven_t_`
-3. `e_worva_(mi_rota`
 **Context Size 2:**
-1. `a_dem_sinzo_pala_`
-2. `s_rimallifa_jechy`
-3. `_(_puldaegan_baka`
 **Context Size 3:**
-1. `_:_citesa_)_ke_cou`
-2. `us_flowasinafa._13`
-3. `est-_:_pert_ke_cou`
 **Context Size 4:**
-1. `uestexa_is_katceem_`
-2. `_vuest-_:_cites_zib`
-3. `vuestexa_iku_hulske`
 ### Key Findings
-- **Best Predictability:** Context-4 (word) with 90.0% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (181,330 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -314,26 +346,26 @@ Below are text samples generated from each subword-based Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 58,132 |
-| Total Tokens | 3,516,474 |
-| Mean Frequency | 60.49 |
 | Median Frequency | 5 |
-| Frequency Std Dev | 1081.22 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | en | 127,536 |
 | 2 | vuest | 124,885 |
-| 3 | ke | 85,674 |
-| 4 | of | 52,510 |
 | 5 | tir | 40,501 |
-| 6 | is | 37,526 |
-| 7 | katca | 36,175 |
-| 8 | va | 35,605 |
-| 9 | bak | 28,769 |
-| 10 | koe | 28,642 |
 ### Least Common Words (from vocabulary)
@@ -354,24 +386,24 @@ Below are text samples generated from each subword-based Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.1323 |
-| R² (Goodness of Fit) | 0.996890 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 48.8% |
-| Top 1,000 | 72.1% |
-| Top 5,000 | 86.1% |
 | Top 10,000 | 91.0% |
 ### Key Findings
 - **Zipf Compliance:** R²=0.9969 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 48.8% of corpus
-- **Long Tail:** 48,132 words needed for remaining 9.0% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -387,37 +419,40 @@ Below are text samples generated from each subword-based Markov chain model:
 ### 5.1 Cross-Lingual Alignment
-> *Note: Multilingual alignment visualization not available for this language.*
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
-| **mono_32d** | 32 | 0.8793 🏆 | 0.3481 | N/A | N/A |
-| **mono_64d** | 64 | 0.8305 | 0.2964 | N/A | N/A |
-| **mono_128d** | 128 | 0.6711 | 0.2516 | N/A | N/A |
 ### Key Findings
-- **Best Isotropy:** mono_32d with 0.8793 (more uniform distribution)
-- **Semantic Density:** Average pairwise similarity of 0.2987. Lower values indicate better semantic separation.
-- **Alignment Quality:** No aligned models evaluated in this run.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
-> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
-| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
-| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
 ### 6.2 Affix Inventory (Productive Units)
@@ -426,18 +461,19 @@ These are the most productive prefixes and suffixes identified by sampling the v
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
-| `-a` | engada, winkapa, riftakola |
-| `-s` | latimanus, hyladelphys, adocetus |
-| `-us` | latimanus, adocetus, eptesicus |
-| `-ra` | rupera, remtrakura, prosthemadera |
-| `-on` | daemon, lavion, prostelayon |
-| `-fa` | altokafa, kalkafa, ronepafa |
-| `-afa` | altokafa, kalkafa, ronepafa |
-| `-is` | africaeaustralis, variabilis, louis |
 ### 6.3 Bound Stems (Lexical Roots)
@@ -445,25 +481,35 @@ Bound stems are high-frequency subword units that are semantically cohesive but
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
-| `ensi` | 2.40x | 45 contexts | owensi, pensil, ozensis |
-| `ayar` | 1.87x | 93 contexts | gayar, vayar, iayar |
-| `urus` | 2.16x | 23 contexts | purus, urusí, gaurus |
-| `anta` | 1.52x | 73 contexts | yanta, danta, canta |
-| `imta` | 2.04x | 22 contexts | pimtas, kimtaf, krimta |
-| `tava` | 1.80x | 25 contexts | stava, kotava, yultava |
-| `atca` | 1.63x | 31 contexts | zatca, datca, catca |
-| `pimt` | 2.38x | 8 contexts | pimtas, pimtar, pimtan |
-| `stes` | 1.74x | 16 contexts | restes, lestes, estesa |
-| `neon` | 2.09x | 8 contexts | roneon, taneon, deneon |
-| `xant` | 1.53x | 19 contexts | xanta, xanto, xantik |
-| `katc` | 1.55x | 14 contexts | katca, katcaf, katcaal |
 ### 6.4 Affix Compatibility (Co-occurrence)
 This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
-*No significant affix co-occurrences detected.*
 ### 6.5 Recursive Morpheme Segmentation
@@ -471,26 +517,26 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
-| rumeikafa | **`rumeik-afa`** | 4.5 | `rumeik` |
-| dolekikafa | **`dolekik-afa`** | 4.5 | `dolekik` |
-| unenikafa | **`unenik-afa`** | 4.5 | `unenik` |
-| tetschener | **`tetschen-er`** | 4.5 | `tetschen` |
-| jotugalafa | **`jotugal-afa`** | 4.5 | `jotugal` |
-| gogolason | **`gogolas-on`** | 4.5 | `gogolas` |
-| rontagentimafa | **`rontagentim-afa`** | 4.5 | `rontagentim` |
-| getalteon | **`getalte-on`** | 4.5 | `getalte` |
-| azilnyofara | **`azilnyo-fa-ra`** | 3.0 | `azilnyo` |
-| tunotrara | **`tunot-ra-ra`** | 3.0 | `tunot` |
-| dimpiyison | **`dimpiy-is-on`** | 3.0 | `dimpiy` |
-| otonycteris | **`otonyct-er-is`** | 3.0 | `otonyct` |
-| rhinonicteris | **`rhinonict-er-is`** | 3.0 | `rhinonict` |
-| chrotopterus | **`chrotopt-er-us`** | 3.0 | `chrotopt` |
-| talturonon | **`taltur-on-on`** | 3.0 | `taltur` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
-The language AVK appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 ---
 ## 7. Summary & Recommendations
@@ -503,7 +549,7 @@ The language AVK appears to be more isolating or has a highly fixed vocabulary.
 |-----------|-------------|-----------|
 | Tokenizer | **64k BPE** | Best compression (4.69x) |
 | N-gram | **2-gram** | Lowest perplexity (284) |
-| Markov | **Context-4** | Highest predictability (90.0%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
@@ -717,4 +763,4 @@ MIT License - Free for academic and commercial use.
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2026-01-03 05:31:10*

 ---
 language: avk
+language_name: Kotava
 language_family: constructed_other
 tags:
   - wikilangs
   - n-gram
   - markov
   - wikipedia
+  - feature-extraction
+  - sentence-similarity
+  - tokenization
+  - n-grams
+  - markov-chain
+  - text-mining
+  - fasttext
+  - babelvec
+  - vocabulous
+  - vocabulary
   - monolingual
   - family-constructed_other
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 4.689
   - name: best_isotropy
     type: isotropy
+    value: 0.8768
   - name: vocabulary_size
     type: vocab
     value: 0
 generated: 2026-01-03
 ---
+# Kotava - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Kotava** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 3.689x | 3.69 | 0.2370% | 255,266 |
+| **16k** | 4.051x | 4.06 | 0.2603% | 232,417 |
+| **32k** | 4.380x | 4.39 | 0.2815% | 214,947 |
+| **64k** | 4.689x 🏆 | 4.69 | 0.3013% | 200,817 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `Victoria tir kelu is lozolonafa widava ke Seycella tigisa valente patecta koe Ma...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁victor ia ▁tir ▁kelu ▁is ▁lozolonafa ▁widava ▁ke ▁s ey ... (+13 more)` | 23 |
+| 16k | `▁victoria ▁tir ▁kelu ▁is ▁lozolonafa ▁widava ▁ke ▁sey c ella ... (+10 more)` | 20 |
+| 32k | `▁victoria ▁tir ▁kelu ▁is ▁lozolonafa ▁widava ▁ke ▁sey c ella ... (+10 more)` | 20 |
+| 64k | `▁victoria ▁tir ▁kelu ▁is ▁lozolonafa ▁widava ▁ke ▁seycella ▁tigisa ▁valente ... (+8 more)` | 18 |
 **Sample 2:** `Bifa Afrika Amerika Asia Europa Oceania Koblira Awalkera sanda`
 ### Key Findings
+- **Best Compression:** 64k achieves 4.689x compression
 - **Lowest UNK Rate:** 8k with 0.2370% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 4,342 | 12.08 | 65,378 | 38.8% | 59.6% |
+| **2-gram** | Subword | 284 🏆 | 8.15 | 3,324 | 63.4% | 99.6% |
+| **3-gram** | Word | 9,058 | 13.14 | 131,536 | 34.4% | 51.5% |
+| **3-gram** | Subword | 1,996 | 10.96 | 24,495 | 26.3% | 74.2% |
+| **4-gram** | Word | 16,918 | 14.05 | 222,038 | 30.4% | 44.4% |
+| **4-gram** | Subword | 7,464 | 12.87 | 124,607 | 17.4% | 50.9% |
+| **5-gram** | Word | 18,754 | 14.19 | 212,819 | 28.7% | 42.2% |
+| **5-gram** | Subword | 17,155 | 14.07 | 346,727 | 14.4% | 42.8% |
 ### Top 5 N-grams by Size
 | 2 | `of life` | 25,896 |
 | 3 | `of the` | 24,998 |
 | 4 | `the world` | 24,670 |
+| 5 | `mammal species` | 24,652 |
 **3-grams (Word):**
 | 1 | `species of the world` | 24,652 |
 | 2 | `mammal species of the` | 24,652 |
 | 3 | `bak taneon zo pimtayar` | 15,309 |
+| 4 | `zo pimtayar vexala dem` | 15,226 |
+| 5 | `taneon zo pimtayar vexala` | 15,225 |
+**5-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `mammal species of the world` | 24,652 |
+| 2 | `taneon zo pimtayar vexala dem` | 15,225 |
+| 3 | `bak taneon zo pimtayar vexala` | 14,992 |
+| 4 | `en vuest animal diversity web` | 14,121 |
+| 5 | `en vuest catalogue of life` | 14,116 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `a _` | 682,687 |
+| 2 | `s _` | 476,530 |
+| 3 | `_ (` | 458,247 |
+| 4 | `e _` | 386,463 |
+| 5 | `_ v` | 360,083 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ : _` | 268,658 |
+| 2 | `u s _` | 176,817 |
+| 3 | `e s t` | 175,950 |
+| 4 | `_ v u` | 167,654 |
+| 5 | `u e s` | 166,548 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `u e s t` | 164,186 |
 | 2 | `_ v u e` | 163,879 |
 | 3 | `v u e s` | 163,702 |
 | 4 | `) _ v u` | 124,953 |
 | 5 | `e s t -` | 124,892 |
+**5-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ v u e s` | 163,700 |
+| 2 | `v u e s t` | 163,699 |
+| 3 | `u e s t -` | 124,886 |
+| 4 | `e s t - _` | 124,885 |
+| 5 | `) _ v u e` | 124,841 |
 ### Key Findings
 - **Best Perplexity:** 2-gram (subword) with 284
 - **Entropy Trend:** Increases with larger n-grams (12.08 → 14.19 for word, 8.15 → 14.07 for subword)
+- **Coverage:** Top-1000 patterns cover ~43% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.9054 | 1.873 | 5.52 | 115,002 | 9.5% |
+| **1** | Subword | 1.0377 | 2.053 | 7.85 | 900 | 0.0% |
+| **2** | Word | 0.2492 | 1.189 | 1.63 | 633,477 | 75.1% |
+| **2** | Subword | 0.9459 | 1.926 | 5.95 | 7,069 | 5.4% |
+| **3** | Word | 0.1398 | 1.102 | 1.31 | 1,026,801 | 86.0% |
+| **3** | Subword | 0.7949 | 1.735 | 4.30 | 42,037 | 20.5% |
+| **4** | Word | 0.1005 🏆 | 1.072 | 1.21 | 1,342,358 | 89.9% |
+| **4** | Subword | 0.6925 | 1.616 | 3.14 | 180,930 | 30.8% |
 ### Generated Text Samples (Word-based)
 **Context Size 1:**
+1. `en fr vuest paleobiology database dipodomys heermanni heermanni jolonensis grinnell and chiefdom a a...`
+2. `vuest paleobiology database xerus erythropus leucoumbrinus rüppell en vuest paleobiology database bu...`
+3. `ke otomops johnstonei en vuest itis rusa bak taneon zo bendeyer ewava vuestexa is xantaza en`
 **Context Size 2:**
+1. `en vuest walvedeyafa zveriopafa aba 2 2 siatos 5 katca oxi phonygammus 1 katca proklano philemon pro...`
+2. `of life dicerorhinus sumatrensis lasiotis en vuest ncbi campicoloides fr en vuest mammal species of ...`
+3. `of the world siatos ke konakara apta dere tid ke mila veyafa katca vesnol nycticeius humeralis humer...`
 **Context Size 3:**
+1. `of the world v 3 petrogale purpureicollis le souef en vuest cites ctenomys colburni en vuest uicn ka...`
+2. `species of the world v 3 isolobodon portoricensis j a allen vesnol myotis yumanensis sociabilis h w ...`
+3. `mammal species of the world siatos ke konakara apta dere tid ke mila veyafa katca vesnol lonchorhina...`
 **Context Size 4:**
+1. `species of the world siatos ke bata katca tir aptiskafa dere rupel pulasa vuestexa is xantaza en vue...`
+2. `mammal species of the world siatos ke bata katca tir aptiskafa pulasa vuestexa is xantaza en vuest m...`
+3. `bak taneon zo pimtayar vexala dem katceem sedme vuestesa pulara ke walvedeyafa zveriopafa aba 2 2 si...`
 ### Generated Text Samples (Subword-based)
 **Context Size 1:**
+1. `_ves_wirtis_cada`
+2. `aldronururuda_a,`
+3. `e_oe_s_ta_tcimot`
 **Context Size 2:**
+1. `a_:_cathirojunafa`
+2. `s_:_le="vey_tazne`
+3. `_(heropanelterifo`
 **Context Size 3:**
+1. `_:_burnata_kuksa_(`
+2. `us_paleobiologue_o`
+3. `ested_nudingus_vor`
 **Context Size 4:**
+1. `uest-_:_uicn_:_acom`
+2. `_vuestesa_vaticus_p`
+3. `vuest-_:_mephitis_:`
 ### Key Findings
+- **Best Predictability:** Context-4 (word) with 89.9% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (180,930 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 58,045 |
+| Total Tokens | 3,510,675 |
+| Mean Frequency | 60.48 |
 | Median Frequency | 5 |
+| Frequency Std Dev | 1080.85 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | en | 127,527 |
 | 2 | vuest | 124,885 |
+| 3 | ke | 85,172 |
+| 4 | of | 52,509 |
 | 5 | tir | 40,501 |
+| 6 | is | 37,459 |
+| 7 | katca | 36,160 |
+| 8 | va | 35,241 |
+| 9 | bak | 28,713 |
+| 10 | koe | 28,499 |
 ### Least Common Words (from vocabulary)
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 1.1330 |
+| R² (Goodness of Fit) | 0.996896 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 48.9% |
+| Top 1,000 | 72.2% |
+| Top 5,000 | 86.2% |
 | Top 10,000 | 91.0% |
 ### Key Findings
 - **Zipf Compliance:** R²=0.9969 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 48.9% of corpus
+- **Long Tail:** 48,045 words needed for remaining 9.0% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ### 5.1 Cross-Lingual Alignment
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.8768 🏆 | 0.3464 | N/A | N/A |
+| **mono_64d** | 64 | 0.8339 | 0.2956 | N/A | N/A |
+| **mono_128d** | 128 | 0.6767 | 0.2580 | N/A | N/A |
+| **aligned_32d** | 32 | 0.8768 | 0.3495 | 0.0440 | 0.2440 |
+| **aligned_64d** | 64 | 0.8339 | 0.2976 | 0.0760 | 0.3520 |
+| **aligned_128d** | 128 | 0.6767 | 0.2493 | 0.1320 | 0.4720 |
 ### Key Findings
+- **Best Isotropy:** mono_32d with 0.8768 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.2994. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 13.2% R@1 in cross-lingual retrieval.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **-0.015** | Low formulaic content | - |
 ### 6.2 Affix Inventory (Productive Units)
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
+| `ma-` | maltadleks, marnatum, marco |
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
+| `-a` | teguina, coa, klaba |
+| `-s` | verticalis, mees, tellus |
+| `-us` | tellus, scapanulus, catagonus |
+| `-ra` | tara, aliera, mallanira |
+| `-er` | edobeyer, walzer, palliser |
+| `-is` | verticalis, anhuiensis, quitensis |
+| `-on` | goreston, styron, laizon |
+| `-fa` | kaikifa, isteamerikafa, lopinafa |
 ### 6.3 Bound Stems (Lexical Roots)
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
+| `ayar` | 2.06x | 93 contexts | vayar, wayar, iayar |
+| `ensi` | 2.30x | 45 contexts | pensil, owensi, hensies |
+| `anta` | 1.76x | 73 contexts | canta, tanta, xanta |
+| `urus` | 2.30x | 23 contexts | purus, urusí, myurus |
+| `imta` | 2.02x | 22 contexts | pimtas, pimtad, pimtan |
+| `tava` | 1.80x | 25 contexts | stava, kotava, poltava |
+| `atca` | 1.64x | 31 contexts | zatca, datca, catca |
+| `pimt` | 2.34x | 8 contexts | pimtas, pimtad, pimtan |
+| `stes` | 1.71x | 16 contexts | lestes, wastes, restes |
+| `xant` | 1.51x | 19 contexts | xanta, xanto, xantik |
+| `neon` | 2.03x | 8 contexts | deneon, keneon, roneon |
+| `ukol` | 1.51x | 14 contexts | bukol, stukol, moukol |
 ### 6.4 Affix Compatibility (Co-occurrence)
 This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `ma-` | `-a` | 34 words | mafia, malaya |
+| `ma-` | `-s` | 29 words | maculicollis, mangas |
+| `ma-` | `-is` | 13 words | maculicollis, managuensis |
+| `ma-` | `-us` | 8 words | macrocephalicus, mastus |
+| `ma-` | `-ra` | 7 words | malyerara, mallapira |
+| `ma-` | `-on` | 5 words | maubuisson, malsaveson |
+| `ma-` | `-er` | 5 words | malgruper, mayasquer |
+| `ma-` | `-es` | 4 words | manzanares, macropodiformes |
+| `ma-` | `-fa` | 4 words | magyarafa, malyoparafa |
+| `ma-` | `-afa` | 4 words | magyarafa, malyoparafa |
 ### 6.5 Recursive Morpheme Segmentation
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
+| balumarafa | **`balumar-afa`** | 4.5 | `balumar` |
+| vageroneon | **`vagerone-on`** | 4.5 | `vagerone` |
+| koridanikafa | **`koridanik-afa`** | 4.5 | `koridanik` |
+| lidarotifa | **`lidaroti-fa`** | 4.5 | `lidaroti` |
+| pacificus | **`pacific-us`** | 4.5 | `pacific` |
+| yambikafa | **`yambik-afa`** | 4.5 | `yambik` |
+| zimmerius | **`zimmeri-us`** | 4.5 | `zimmeri` |
+| christies | **`christi-es`** | 4.5 | `christi` |
+| bristutuson | **`bristut-us-on`** | 3.0 | `bristut` |
+| aultoveson | **`aultov-es-on`** | 3.0 | `aultov` |
+| promeneuses | **`promene-us-es`** | 3.0 | `promene` |
+| stakseson | **`staks-es-on`** | 3.0 | `staks` |
+| atlantoxerus | **`atlantox-er-us`** | 3.0 | `atlantox` |
+| mantukafa | **`ma-ntuk-afa`** | 3.0 | `ntuk` |
+| ruyatakoler | **`ruyatakol-er`** | 1.5 | `ruyatakol` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
+The language Kotava shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
 ---
 ## 7. Summary & Recommendations
 |-----------|-------------|-----------|
 | Tokenizer | **64k BPE** | Best compression (4.69x) |
 | N-gram | **2-gram** | Lowest perplexity (284) |
+| Markov | **Context-4** | Highest predictability (89.9%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-03 17:46:55*

models/embeddings/aligned/avk_128d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d5bc2fb68fc16599a7df591a4faa9874fafbac2b4321646c3eb80ee929289b8
+size 1070216041

models/embeddings/aligned/avk_128d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "avk", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/avk_128d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:046c39f51199072057fb8146454cf806db4867200588a1767bc05ce312ff7760
+size 65664

models/embeddings/aligned/avk_128d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "avk",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 10167,
+  "vocab_size": 44349
+}

models/embeddings/aligned/avk_32d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8853f84ade1376f0ea457f44f0519aedf712c23eecc35494f3e963a13c362e51
+size 268156009

models/embeddings/aligned/avk_32d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "avk", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/avk_32d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eca53a6013e2037c2f47dc5e1949fc0a2b6ece6075e88cf7212d0602dbd8c807
+size 4224

models/embeddings/aligned/avk_32d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "avk",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 10167,
+  "vocab_size": 44349
+}

models/embeddings/aligned/avk_64d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab0343f70502c04d2f85016d1052a4e74696fa24885f3ad8392e25d726279437
+size 535509353

models/embeddings/aligned/avk_64d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "avk", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/avk_64d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:10505483a833ed429a570df9d39c4f5ab1cd9c736b7ba77d0c81d45ffd168609
+size 16512

models/embeddings/aligned/avk_64d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "avk",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 10167,
+  "vocab_size": 44349
+}

models/embeddings/monolingual/avk_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c0357f4c0fa47381c2d92e0415911327b53c389a3d486edc400fcce551ae310d
-size 1070325438

 version https://git-lfs.github.com/spec/v1
+oid sha256:4d5bc2fb68fc16599a7df591a4faa9874fafbac2b4321646c3eb80ee929289b8
+size 1070216041

models/embeddings/monolingual/avk_128d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 128
   },
-  "vocab_size": 44454
 }

     "encoding_method": "rope",
     "dim": 128
   },
+  "vocab_size": 44349
 }

models/embeddings/monolingual/avk_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:829d093f87f92bee34bc2d9b3ee3c6e53006f7208dba8cd714618f0305b957af
-size 268184766

 version https://git-lfs.github.com/spec/v1
+oid sha256:8853f84ade1376f0ea457f44f0519aedf712c23eecc35494f3e963a13c362e51
+size 268156009

models/embeddings/monolingual/avk_32d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 32
   },
-  "vocab_size": 44454
 }

     "encoding_method": "rope",
     "dim": 32
   },
+  "vocab_size": 44349
 }

models/embeddings/monolingual/avk_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7d210dae793291e501a5a742665aafce5364647858baa5dd956782a0c6d80a70
-size 535564990

 version https://git-lfs.github.com/spec/v1
+oid sha256:ab0343f70502c04d2f85016d1052a4e74696fa24885f3ad8392e25d726279437
+size 535509353

models/embeddings/monolingual/avk_64d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 64
   },
-  "vocab_size": 44454
 }

     "encoding_method": "rope",
     "dim": 64
   },
+  "vocab_size": 44349
 }

models/subword_markov/avk_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:21791c83470a53c161c68f122259b7897a3a907d75078d307676c0177d2c36bb
-size 63469

 version https://git-lfs.github.com/spec/v1
+oid sha256:f65687f254c652f69ef9be8131bf19edbff47526c12511f028d24075dae452c9
+size 63478

models/subword_markov/avk_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -3,5 +3,5 @@
   "variant": "subword",
   "language": "avk",
   "unique_contexts": 900,
-  "total_transitions": 26140715
 }

   "variant": "subword",
   "language": "avk",
   "unique_contexts": 900,
+  "total_transitions": 26104447
 }

models/subword_markov/avk_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:698bcc0cfe9a7f9b86fa6f44a6777b8a6a41971070d8ab3b73f98b09f24b3b36
-size 379498

 version https://git-lfs.github.com/spec/v1
+oid sha256:87c2c934c6a6a0e852b7510fffdc0cfa7afde5724d65c615397f5807bb5d2890
+size 374727

models/subword_markov/avk_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "avk",
-  "unique_contexts": 7086,
-  "total_transitions": 26111423
 }

   "context_size": 2,
   "variant": "subword",
   "language": "avk",
+  "unique_contexts": 7069,
+  "total_transitions": 26075153
 }

models/subword_markov/avk_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e97b0774917adc7e1e8666a420b5a88cece271cf99ea1b5f68cd532f0018a92b
-size 1583547

 version https://git-lfs.github.com/spec/v1
+oid sha256:c8486b2f7957071361dbd00d231776b15d9bd46e7798b78454470549c572886c
+size 1574087

models/subword_markov/avk_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "avk",
-  "unique_contexts": 42171,
-  "total_transitions": 26082131
 }

   "context_size": 3,
   "variant": "subword",
   "language": "avk",
+  "unique_contexts": 42037,
+  "total_transitions": 26045859
 }

models/subword_markov/avk_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aecee602f0b44cb0f438169dfcceb1a8a89110470f4c323a12fc42d4af2e25ae
-size 4497423

 version https://git-lfs.github.com/spec/v1
+oid sha256:09a5d485439f551617d7e6db1b1f5e923031936d395de0d0b6c395dac427ea25
+size 4492959

models/subword_markov/avk_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "avk",
-  "unique_contexts": 181330,
-  "total_transitions": 26052839
 }

   "context_size": 4,
   "variant": "subword",
   "language": "avk",
+  "unique_contexts": 180930,
+  "total_transitions": 26016565
 }

models/subword_ngram/avk_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:542c6d1e24d5e6dfba739d8aa22907d19ad252430bc7d0ccd256a501d0531f72
-size 47284

 version https://git-lfs.github.com/spec/v1
+oid sha256:ed6e7b7ad31ba22fe41824beee6908fc8170fb2f1f76c61addff4e9b04c9932e
+size 47151

models/subword_ngram/avk_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "avk",
-  "unique_ngrams": 3342,
-  "total_ngrams": 26140715
 }

   "n": 2,
   "variant": "subword",
   "language": "avk",
+  "unique_ngrams": 3324,
+  "total_ngrams": 26104447
 }

models/subword_ngram/avk_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a059662c6d88d086a3fb221455feeb7c778b83b67df81086daea2aa0cddec23c
-size 326709

 version https://git-lfs.github.com/spec/v1
+oid sha256:223425e4b5985727026ac6e9aa9d270f255e60307dc67feb52135ab124aa71b7
+size 325386

models/subword_ngram/avk_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "avk",
-  "unique_ngrams": 24557,
-  "total_ngrams": 26111423
 }

   "n": 3,
   "variant": "subword",
   "language": "avk",
+  "unique_ngrams": 24495,
+  "total_ngrams": 26075153
 }

models/subword_ngram/avk_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:828f99a99e0050489b41c4a9df2305254560e588c8d8d5ec02dd0d2eb1cd3530
-size 1449644

 version https://git-lfs.github.com/spec/v1
+oid sha256:46348619ec1f35d5b9b5a791b6e52c986212e910495139645ef401b6c5b19293
+size 1466527

models/subword_ngram/avk_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "avk",
-  "unique_ngrams": 124823,
-  "total_ngrams": 26082131
 }

   "n": 4,
   "variant": "subword",
   "language": "avk",
+  "unique_ngrams": 124607,
+  "total_ngrams": 26045859
 }

models/subword_ngram/avk_5gram_subword.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0eeca6699c9f43d66da4ebc2b55d17aca64e6e1da22891f7c56ba5685a78bf8
+size 3979959

models/subword_ngram/avk_5gram_subword_metadata.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "n": 5,
+  "variant": "subword",
+  "language": "avk",
+  "unique_ngrams": 346727,
+  "total_ngrams": 26016565
+}

models/tokenizer/avk_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc42b2323e7bee7429a820f2ea7e2009b2fb74e192b3fe3a17923335350d22c5
-size 510898

 version https://git-lfs.github.com/spec/v1
+oid sha256:61aa561358a08bbf10b56396e1e242ff4daf76f48bd3e1430c9a21c6646c8e63
+size 510862

models/tokenizer/avk_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/avk_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:91bac420b4e318b698464d40a4467873b19d48b58df745bb29a08985b8c357b4
-size 793808

 version https://git-lfs.github.com/spec/v1
+oid sha256:0f3b72a63d1014b526eb5279776802d09220a24034b269459c333a9a664fa6c3
+size 793698

models/tokenizer/avk_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/avk_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f27f3f01c3515fa7385b2952b4f6df13e378c5cd044085d3bbce4d2ad95a544a
-size 1369364

 version https://git-lfs.github.com/spec/v1
+oid sha256:b2448f2ba279b7033939c7a46d2a2a9dc156e5a3720bd6cfdf679c543a76cae1
+size 1369207

models/tokenizer/avk_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/avk_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6cee4ebbae3d1c7add217a4a460590ce0d0dd21f13b527e6db5ed0db19b09351
-size 374235

 version https://git-lfs.github.com/spec/v1
+oid sha256:65b827ee5b00ced41dcc33bbebdcf83a04cc3e86c492efa730c760a58ceb5ffe
+size 374206

models/tokenizer/avk_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/avk_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d2f858e168566644dadfc0198deb685658a860427034fac7e9621ba6c6c331f
-size 1016629

 version https://git-lfs.github.com/spec/v1
+oid sha256:01d722fcf181d81cacc7fc5650bf4964605a556a442201ecc9e3bcf942c24bb0
+size 1019369

models/vocabulary/avk_vocabulary_metadata.json CHANGED Viewed

@@ -1,17 +1,17 @@
 {
   "language": "avk",
-  "vocabulary_size": 58132,
   "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.032232319628378144,
     "coverage": {
-      "top_100": 0.48045109520711343,
-      "top_1000": 0.7096278884294919,
-      "top_5000": 0.8476448324833323,
-      "top_10000": 0.8952395743698449
     },
-    "hapax_count": 57051,
-    "hapax_ratio": 0.4953074672477709,
-    "total_documents": 29292
   }
 }

 {
   "language": "avk",
+  "vocabulary_size": 58045,
   "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.032246491062106274,
     "coverage": {
+      "top_100": 0.48079183221565863,
+      "top_1000": 0.7100994345056655,
+      "top_5000": 0.8479087921405397,
+      "top_10000": 0.8953996650479654
     },
+    "hapax_count": 57000,
+    "hapax_ratio": 0.495458298926507,
+    "total_documents": 29294
   }
 }

models/word_markov/avk_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49390c536dd83430c0715d46b765fc2890bf4f4d31c5442852be6a75add7a744
-size 5111239

 version https://git-lfs.github.com/spec/v1
+oid sha256:555e68cdc2c21bda05bd3d25873880af9c89722f60875a1a12dbc25a7c979c16
+size 5073388

models/word_markov/avk_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "avk",
-  "unique_contexts": 115135,
-  "total_transitions": 3544233
 }

   "context_size": 1,
   "variant": "word",
   "language": "avk",
+  "unique_contexts": 115002,
+  "total_transitions": 3538381
 }

models/word_markov/avk_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:08a9aef65566043a63c8658765edb2f951063117cc9f1704b722bd3a5d81e0ca
-size 11858936

 version https://git-lfs.github.com/spec/v1
+oid sha256:4c519efe15d5791630fec68525a6c287b347a81e0691dbc26e51750bdee13659
+size 11825183

models/word_markov/avk_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "avk",
-  "unique_contexts": 635326,
-  "total_transitions": 3514941
 }

   "context_size": 2,
   "variant": "word",
   "language": "avk",
+  "unique_contexts": 633477,
+  "total_transitions": 3509087
 }