"""
Extract high-value Caribbean entities from training transcripts.
This builds a gazetteer purely from the competition dataset (no external data).
Supports both single-word and multi-word entity extraction.
"""
import pandas as pd
import re
from typing import Set, Dict, List, Tuple
def extract_ngrams(words: List[str], n: int) -> List[Tuple[str, ...]]:
    """Extract n-grams from a list of words."""
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
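# For example, extract_ngrams(["Port", "of", "Spain"], 2) returns
# [("Port", "of"), ("of", "Spain")]; when n > len(words), the range is
# empty and the function returns [].
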
def is_phrase_capitalized(phrase_words: List[str]) -> bool:
    """Check if a phrase has proper capitalization (all words capitalized)."""
    return all(word[0].isupper() if word else False for word in phrase_words)
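# For example, is_phrase_capitalized(["Port", "Spain"]) is True, while
# is_phrase_capitalized(["Port", "of", "Spain"]) is False because "of" is
# lowercase; an empty string in the list also fails the check.
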
def extract_entities_from_transcripts(train_df: pd.DataFrame,
                                      min_frequency: int = 50,
                                      min_frequency_multiword: int = 20,
                                      capitalization_threshold: float = 0.7,
                                      verbose: bool = True) -> Set[str]:
"""
Extract high-value entities from training transcripts based on:
1. Frequency (appears > min_frequency times)
2. Capitalization pattern (capitalized/ALLCAPS most of the time)
3. Multi-word phrase detection (bigrams, trigrams)
4. Proximity to known Caribbean keywords (optional filter)
Args:
train_df: DataFrame with 'transcription' column (lowercase)
min_frequency: Minimum occurrences for single-word entities
min_frequency_multiword: Minimum occurrences for multi-word entities
capitalization_threshold: Minimum ratio of capitalized occurrences (0-1)
verbose: Print progress and statistics
"""
    # Known Caribbean keywords for context filtering
    caribbean_keywords = {
        "caribbean", "bbc", "report", "london", "port", "prime", "minister",
        "trinidad", "tobago", "jamaica", "guyana", "haiti", "barbados",
        "antigua", "dominica", "grenada", "montserrat", "lucia", "kitts",
        "nevis", "suriname", "caricom", "west", "indies"
    }
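    # NOTE: caribbean_keywords and the 'near_caribbean' counter further down
    # are set up for the proximity filter mentioned in the docstring, but the
    # filter is not applied anywhere in this function yet.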
    # Exclusion list: common words that are frequently capitalized but not entities
    EXCLUDED_WORDS = {
        # Single letters
        "i", "u",
        # Titles/honorifics
        "mr", "mrs", "ms", "dr", "sir", "madam",
        # Days of week
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
        # Months
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
        # Common nouns that often start sentences (excluded as single words,
        # but still allowed inside multi-word phrases)
        "minister", "assembly", "council", "department", "secretariat",
        "secretary", "parliament", "congress", "labour", "republic", "states",
        "attorney", "association",
        # Adjectives (nationality/descriptive)
        "british", "american", "cuban", "haitian", "guyanese", "jamaican",
        "trinidadian", "dominican", "african", "european", "indian", "dutch",
        "french", "eastern", "latin", "south", "west", "north",
    }
    # Exclusion patterns for multi-word phrases
    EXCLUDED_PHRASE_PATTERNS = {
        # Generic time references
        "last week", "this week", "next week", "last year", "this year", "next year",
        "last month", "this month", "next month",
        # Generic titles without names
        "prime minister", "foreign minister", "finance minister",
        # Articles and common phrases
        "the report", "the government", "the country",
    }
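    # NOTE: phrase exclusion is an exact match, so "prime minister" alone is
    # dropped, but a longer trigram such as "prime minister <name>" can still
    # qualify (and is rescued by the pattern check further down).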
    # Track word and phrase occurrences with capitalization info
    word_stats: Dict[str, Dict[str, int]] = {}
    phrase_stats: Dict[str, Dict[str, int]] = {}

    if verbose:
        print("\n[1/3] Analyzing single words and multi-word phrases...")

    # Support both 'Transcription' (CSV) and 'transcription' (HF dataset)
    transcription_col = 'transcription' if 'transcription' in train_df.columns else 'Transcription'
    for transcription in train_df[transcription_col]:
        if pd.isna(transcription):
            continue

        # Tokenize: keep alphabetic runs only; digits and punctuation are dropped
        words = re.findall(r'\b[A-Za-z]+\b', str(transcription))
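        # e.g. re.findall(r'\b[A-Za-z]+\b', "Port-of-Spain, Trinidad.")
        # yields ['Port', 'of', 'Spain', 'Trinidad'] -- hyphenated names are
        # split into their component words.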
        # === SINGLE WORD EXTRACTION ===
        for word in words:
            word_lower = word.lower()
            if word_lower not in word_stats:
                word_stats[word_lower] = {
                    'total': 0,
                    'capitalized': 0,
                    'allcaps': 0,
                    'near_caribbean': 0
                }
            word_stats[word_lower]['total'] += 1

            # Check capitalization
            if word.isupper() and len(word) > 1:
                word_stats[word_lower]['allcaps'] += 1
            elif word[0].isupper():
                word_stats[word_lower]['capitalized'] += 1
        # === MULTI-WORD EXTRACTION (bigrams and trigrams) ===
        for n in (2, 3):
            for phrase in extract_ngrams(words, n):
                phrase_lower = ' '.join(w.lower() for w in phrase)
                if phrase_lower not in phrase_stats:
                    phrase_stats[phrase_lower] = {
                        'total': 0,
                        'capitalized': 0,
                    }
                phrase_stats[phrase_lower]['total'] += 1
                # Check if phrase is properly capitalized
                if is_phrase_capitalized(list(phrase)):
                    phrase_stats[phrase_lower]['capitalized'] += 1
    if verbose:
        print(f" - Analyzed {len(word_stats):,} unique words")
        print(f" - Analyzed {len(phrase_stats):,} unique phrases")
    # === FILTER SINGLE WORDS ===
    if verbose:
        print("\n[2/3] Filtering single-word entities...")

    single_word_entities = set()
    for word_lower, stats in word_stats.items():
        # Minimum length filter
        if len(word_lower) < 2:
            continue
        # Exclusion list filter
        if word_lower in EXCLUDED_WORDS:
            continue
        # Frequency filter
        if stats['total'] < min_frequency:
            continue
        # Capitalization filter
        capitalized_ratio = (stats['capitalized'] + stats['allcaps']) / stats['total']
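        # Worked example (illustrative numbers): a word seen 100 times with
        # 65 Title-case and 10 ALLCAPS occurrences scores (65 + 10) / 100 = 0.75,
        # which passes the default threshold of 0.7.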
        if capitalized_ratio < capitalization_threshold:
            continue
        single_word_entities.add(word_lower)

    if verbose:
        print(f" - Found {len(single_word_entities)} single-word entities")
    # === FILTER MULTI-WORD PHRASES ===
    if verbose:
        print("\n[3/3] Filtering multi-word entities...")

    multiword_entities = set()
    for phrase_lower, stats in phrase_stats.items():
        # Exclusion pattern filter
        if phrase_lower in EXCLUDED_PHRASE_PATTERNS:
            continue
        # Frequency filter (lower threshold for multi-word)
        if stats['total'] < min_frequency_multiword:
            continue
        # Capitalization filter
        capitalized_ratio = stats['capitalized'] / stats['total']
        if capitalized_ratio < capitalization_threshold:
            continue
        # Keep a phrase only if a component word is already a known single-word
        # entity, or the phrase matches a common place/title pattern. The
        # pattern check "rescues" phrases such as "puerto rico", whose
        # component words may not qualify on their own.
        words_in_phrase = phrase_lower.split()
        has_known_entity = any(w in single_word_entities for w in words_in_phrase)
        is_common_pattern = any(pattern in phrase_lower for pattern in
                                ['port of', 'puerto', 'st ', ' st', 'prime minister', 'saint'])
        if has_known_entity or is_common_pattern:
            multiword_entities.add(phrase_lower)
    if verbose:
        print(f" - Found {len(multiword_entities)} multi-word entities")
    # === COMBINE ===
    # Plain union of both sets: overlapping shorter entities are kept alongside
    # longer ones, so consumers matching against text should prefer the
    # longest match.
    all_entities = single_word_entities | multiword_entities

    if verbose:
        print(f"\n✓ Total entities: {len(all_entities)} "
              f"({len(single_word_entities)} single + {len(multiword_entities)} multi-word)")
        # Show top entities by type
        if single_word_entities:
            entity_freqs = [(w, word_stats[w]['total']) for w in single_word_entities]
            entity_freqs.sort(key=lambda x: x[1], reverse=True)
            print("\nTop 20 single-word entities:")
            for word, freq in entity_freqs[:20]:
                stats = word_stats[word]
                cap_ratio = (stats['capitalized'] + stats['allcaps']) / stats['total']
                print(f" {word:25s} | freq={freq:5d} | cap_ratio={cap_ratio:.2f}")

        if multiword_entities:
            phrase_freqs = [(p, phrase_stats[p]['total']) for p in multiword_entities]
            phrase_freqs.sort(key=lambda x: x[1], reverse=True)
            print("\nTop 20 multi-word entities:")
            for phrase, freq in phrase_freqs[:20]:
                stats = phrase_stats[phrase]
                cap_ratio = stats['capitalized'] / stats['total']
                print(f" {phrase:25s} | freq={freq:5d} | cap_ratio={cap_ratio:.2f}")

    return all_entities
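

if __name__ == "__main__":
    # Minimal usage sketch under assumptions: the CSV path "data/train.csv"
    # and the output file "gazetteer.txt" are hypothetical placeholders, not
    # paths defined elsewhere in this project. Adjust them to your dataset.
    import sys

    train_path = sys.argv[1] if len(sys.argv) > 1 else "data/train.csv"  # assumed path
    train_df = pd.read_csv(train_path)
    entities = extract_entities_from_transcripts(train_df, verbose=True)
    with open("gazetteer.txt", "w", encoding="utf-8") as fh:  # assumed output
        for entity in sorted(entities):
            fh.write(entity + "\n")
    print(f"Wrote {len(entities)} entities to gazetteer.txt")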