File size: 2,730 Bytes
2941e2c
 
 
 
 
e3aec0d
 
2941e2c
 
 
 
 
 
 
 
 
e3aec0d
2941e2c
e3aec0d
 
 
 
2941e2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3aec0d
2941e2c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""Entity extraction utilities."""
import os
import json
import pandas as pd
from extract_entities import extract_entities_from_transcripts
from data.manager import ENTITIES_PATH
from data.loader import get_train_dataframe


def extract_entities_progress(progress=None):
    """Extract Caribbean entities from training data with progress tracking"""
    try:
        if progress:
            progress(0, desc="Starting entity extraction...")
        
        if progress:
            progress(0.2, desc="Loading training data from dataset...")
        
        try:
            train_df = get_train_dataframe()
        except ValueError as e:
            return f"❌ {str(e)}", "{}"
        if progress:
            progress(0.4, desc=f"Analyzing {len(train_df):,} transcripts...")
        
        # Run extraction
        entities = extract_entities_from_transcripts(
            train_df,
            min_frequency=50,
            min_frequency_multiword=20,
            capitalization_threshold=0.7,
            verbose=False  # Suppress prints in Gradio app
        )
        
        if progress:
            progress(0.9, desc="Saving entities...")
        
        # Save to JSON
        entities_list = sorted(list(entities))
        single_word = sorted([e for e in entities if ' ' not in e])
        multi_word = sorted([e for e in entities if ' ' in e])
        
        output_data = {
            'entities': entities_list,
            'single_word_entities': single_word,
            'multi_word_entities': multi_word,
            'count': len(entities_list),
            'count_single_word': len(single_word),
            'count_multi_word': len(multi_word),
            'extraction_params': {
                'min_frequency': 50,
                'min_frequency_multiword': 20,
                'capitalization_threshold': 0.7
            }
        }
        
        with open(ENTITIES_PATH, 'w') as f:
            json.dump(output_data, f, indent=2)
        
        if progress:
            progress(1.0, desc="Complete!")
        
        top_single = single_word[:15]
        top_multi = multi_word[:15]
        
        summary = f"""
## ✅ Entity Extraction Complete

**Total Entities:** {len(entities_list)} ({len(single_word)} single-word + {len(multi_word)} multi-word)

**Top 15 Single-Word Entities:**
{', '.join(top_single) if top_single else 'None'}

**Top 15 Multi-Word Entities:**
{', '.join(top_multi) if top_multi else 'None'}

**Saved to:** `{ENTITIES_PATH}`
"""
        
        return summary, json.dumps(output_data, indent=2)
        
    except Exception as e:
        import traceback
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
        return error_msg, "{}"