Spaces:

atlasia
/

darijaTransliterator

Sleeping

App Files Files Community

Haitam03 committed on Sep 27

Commit

4d7afb5

verified ·

1 Parent(s): 741c6b3

Update app.py

Browse files

Files changed (1) hide show

app.py +194 -189

app.py CHANGED Viewed

@@ -1,17 +1,14 @@
-# Local version with additional development features
 import gradio as gr
 import torch
 import torch.nn as nn
 import json
-import csv
 import os
 from datetime import datetime
 from torch.nn.utils.rnn import pad_sequence
-# Enable better error messages for local development
-import traceback
-# Define the model architecture (same as your training code)
 class CTCTransliterator(nn.Module):
     def __init__(self, input_dim, hidden_dim, output_dim, num_layers=3, dropout=0.3):
         super().__init__()
@@ -31,150 +28,178 @@ class CTCTransliterator(nn.Module):
         x = x.log_softmax(dim=2)
         return x
-# Cache system for data collection
-class TransliterationCache:
-    def __init__(self, cache_file="transliteration_cache.csv"):
-        self.cache_file = cache_file
-        self.cache = {}
-        self.load_cache()
-    def load_cache(self):
-        """Load existing cache from file"""
-        if os.path.exists(self.cache_file):
-            try:
-                with open(self.cache_file, 'r', encoding='utf-8') as f:
-                    reader = csv.DictReader(f)
-                    for row in reader:
-                        key = f"{row['input']}_{row['direction']}"
-                        self.cache[key] = {
-                            'output': row['output'],
-                            'corrected_output': row.get('corrected_output', ''),
-                            'timestamp': row['timestamp'],
-                            'usage_count': int(row.get('usage_count', 1))
-                        }
-                print(f"Loaded {len(self.cache)} cached translations")
-            except Exception as e:
-                print(f"Error loading cache: {e}")
-    def save_cache(self):
-        """Save cache to file"""
         try:
-            with open(self.cache_file, 'w', encoding='utf-8', newline='') as f:
-                fieldnames = ['input', 'direction', 'output', 'corrected_output', 'timestamp', 'usage_count']
-                writer = csv.DictWriter(f, fieldnames=fieldnames)
-                writer.writeheader()
-                for key, data in self.cache.items():
-                    input_text, direction = key.rsplit('_', 1)
-                    writer.writerow({
-                        'input': input_text,
-                        'direction': direction,
-                        'output': data['output'],
-                        'corrected_output': data.get('corrected_output', ''),
-                        'timestamp': data['timestamp'],
-                        'usage_count': data['usage_count']
-                    })
-            print(f"Cache saved with {len(self.cache)} entries")
         except Exception as e:
-            print(f"Error saving cache: {e}")
-    def get(self, input_text, direction):
-        """Get cached result if exists"""
         key = f"{input_text}_{direction}"
-        if key in self.cache:
-            self.cache[key]['usage_count'] += 1
-            print(f"Using cached result for: {input_text}")
-            self.save_cache()
-            return self.cache[key]['output']
-        return None
     def set(self, input_text, direction, output):
-        """Cache a new result"""
-        key = f"{input_text}_{direction}"
-        self.cache[key] = {
-            'output': output,
-            'corrected_output': '',
-            'timestamp': datetime.now().isoformat(),
-            'usage_count': 1
-        }
-        print(f"Cached new translation: {input_text} → {output}")
-        self.save_cache()
     def update_correction(self, input_text, direction, corrected_output):
-        """Update with user correction"""
-        key = f"{input_text}_{direction}"
-        if key in self.cache:
-            self.cache[key]['corrected_output'] = corrected_output
             print(f"Correction saved: {input_text} → {corrected_output}")
-            self.save_cache()
             return True
-        return False
     def get_stats(self):
-        """Get cache statistics for development"""
-        total = len(self.cache)
-        corrected = sum(1 for item in self.cache.values() if item.get('corrected_output'))
-        most_used = max(self.cache.values(), key=lambda x: x['usage_count'], default={'usage_count': 0})
-        return {
-            'total_translations': total,
-            'corrected_translations': corrected,
-            'most_used_count': most_used['usage_count']
-        }
 # Load vocabularies and model
 def load_model_and_vocabs():
-    print("Loading model and vocabularies...")
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    print(f"Using device: {device}")
-    try:
-        # Load vocabularies
-        with open('latin_stoi.json', 'r', encoding='utf-8') as f:
-            latin_stoi = json.load(f)
-        with open('latin_itos.json', 'r', encoding='utf-8') as f:
-            latin_itos = json.load(f)
-        with open('arabic_stoi.json', 'r', encoding='utf-8') as f:
-            arabic_stoi = json.load(f)
-        with open('arabic_itos.json', 'r', encoding='utf-8') as f:
-            arabic_itos = json.load(f)
-        print(f"Latin vocab size: {len(latin_stoi)}")
-        print(f"Arabic vocab size: {len(arabic_stoi)}")
-        # Initialize model
-        model = CTCTransliterator(
-            input_dim=len(latin_stoi),
-            hidden_dim=256,
-            output_dim=len(arabic_stoi),
-            num_layers=3,
-            dropout=0.4
-        ).to(device)
-        # Load trained weights
-        model.load_state_dict(torch.load('CER_0.091_BLEU_0.85_transliterator.pth', map_location=device))
-        model.eval()
-        print("Model loaded successfully!")
-        # Find blank ID (assuming it's 0)
-        blank_id = 0
-        return model, latin_stoi, latin_itos, arabic_stoi, arabic_itos, blank_id, device
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        print("Full error details:")
-        traceback.print_exc()
-        raise
 # Load everything at startup
-try:
-    model, latin_stoi, latin_itos, arabic_stoi, arabic_itos, blank_id, device = load_model_and_vocabs()
-    cache_system = TransliterationCache()
-    print("App ready to launch!")
-except Exception as e:
-    print(f"Startup failed: {e}")
-    exit(1)
 def encode_text(text, vocab):
     """Encode text using vocabulary"""
@@ -197,14 +222,12 @@ def greedy_decode(log_probs, arabic_itos, blank_id):
     return results
 def transliterate_latin_to_arabic(text):
-    """Transliterate Latin script to Arabic script with caching"""
     if not text.strip():
         return ""
-    print(f"Processing: {text}")
-    # Check cache first
-    cached_result = cache_system.get(text, "Latin → Arabic")
     if cached_result:
         return cached_result
@@ -220,18 +243,13 @@ def transliterate_latin_to_arabic(text):
         decoded = greedy_decode(out, arabic_itos, blank_id)
         result = decoded[0] if decoded else ""
-        print(f"Generated: {result}")
-        # Cache the result
-        cache_system.set(text, "Latin → Arabic", result)
         return result
     except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        print(f"Translation failed: {error_msg}")
-        traceback.print_exc()
-        return error_msg
 def transliterate_arabic_to_latin(text):
     """Transliterate Arabic script to Latin script (placeholder)"""
@@ -245,21 +263,11 @@ def transliterate(text, direction):
         return transliterate_arabic_to_latin(text)
 def save_correction(input_text, direction, corrected_output):
-    """Save user correction to cache"""
-    if cache_system.update_correction(input_text, direction, corrected_output):
-        return "Correction saved! Thank you for improving the model."
     else:
-        return "Could not save correction."
-def get_cache_stats():
-    """Get cache statistics for development dashboard"""
-    stats = cache_system.get_stats()
-    return f"""
-     Cache Statistics:
-    • Total translations: {stats['total_translations']}
-    • Corrected translations: {stats['corrected_translations']}
-    • Most used translation: {stats['most_used_count']} times
-    """
 # Arabic keyboard layout
 arabic_keys = [
@@ -269,25 +277,29 @@ arabic_keys = [
     ['ذ', '١', '٢', '٣', '٤', '٥', '٦', '٧', '٨', '٩', '٠']
 ]
-# Create Gradio interface with development features
 def create_interface():
-    with gr.Blocks(title="Darija Transliterator - Local Dev", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
-            # Darija Transliterator (Local Development)
             Convert between Latin script and Arabic script for Moroccan Darija
-             **Local Development Mode**
-             **Smart Caching**: Results cached for faster responses
-             **Arabic Keyboard**: Built-in Arabic keyboard for corrections
-             **Debug Info**: Detailed logging in console
             """
         )
-        # Development stats
         with gr.Row():
-            stats_btn = gr.Button("Show Cache Stats", variant="secondary")
-            stats_display = gr.Textbox(label="Statistics", interactive=False, visible=False)
         with gr.Row():
             with gr.Column(scale=1):
@@ -337,7 +349,7 @@ def create_interface():
                     with gr.Row():
                         space_btn = gr.Button("Space", size="sm", scale=2)
                         backspace_btn = gr.Button("⌫ Backspace", size="sm", scale=2)
-                        clear_output_btn = gr.Button("🗑️ Clear Output", size="sm", scale=2)
                 # Correction system
                 with gr.Group():
@@ -378,7 +390,7 @@ def create_interface():
         # Stats button
         stats_btn.click(
-            fn=get_cache_stats,
             outputs=[stats_display]
         ).then(
             fn=lambda: gr.update(visible=True),
@@ -389,7 +401,7 @@ def create_interface():
         gr.Markdown("### Examples")
         examples = [
             ["kifash nta?", "Latin → Arabic"],
-            ["salam alikoum", "Latin → Arabic"],
             ["ana bem", "Latin → Arabic"],
             ["wach nta mjit?", "Latin → Arabic"],
             ["شكون نتا؟", "Arabic → Latin"],
@@ -433,20 +445,23 @@ def create_interface():
             outputs=[correction_status]
         )
-        # Add information
         gr.Markdown(
             """
-            ### Local Development Features
-            **Debug Console**: Check your terminal for detailed logs
-            **Cache Statistics**: Click "Show Cache Stats" to see usage data
-            **Hot Reload**: Restart the script to see code changes
-            **Error Details**: Full stack traces for easier debugging
-            **File Locations:**
-            - Cache: `transliteration_cache.csv`
-            - Model: `CER_0.091_BLEU_0.85_transliterator.pth`
-            - Vocabularies: `*_stoi.json` and `*_itos.json` files
             """
         )
@@ -454,15 +469,5 @@ def create_interface():
 # Launch the app
 if __name__ == "__main__":
-    print("Starting Darija Transliterator (Local Development)")
     demo = create_interface()
-    # Local development settings
-    demo.launch(
-        share=True,          # Creates public URL for sharing
-        debug=True,          # Enable debug mode
-        server_port=7860,    # Fixed port
-        server_name="0.0.0.0"  # Allow external access
-    )
-    print("Thanks for using Darija Transliterator!")

 import gradio as gr
 import torch
 import torch.nn as nn
 import json
 import os
 from datetime import datetime
 from torch.nn.utils.rnn import pad_sequence
+import firebase_admin
+from firebase_admin import credentials, firestore
+# Define the model architecture
 class CTCTransliterator(nn.Module):
     def __init__(self, input_dim, hidden_dim, output_dim, num_layers=3, dropout=0.3):
         super().__init__()
         x = x.log_softmax(dim=2)
         return x
+# Firebase Cache System
class FirebaseCache:
    """Firestore-backed cache for transliteration results and user corrections.

    Every (input_text, direction) pair is stored as one document in the
    ``translations`` collection. If Firebase cannot be initialised, ``self.db``
    stays ``None`` and each method returns its "not available" value
    (``None`` / ``False`` / a status string) instead of raising.
    """

    def __init__(self):
        self.db = None
        self.init_firebase()

    @staticmethod
    def _now():
        """Return the current time as a timezone-aware UTC datetime.

        An aware value is unambiguous regardless of the host's local time
        zone, unlike the naive value produced by a bare ``datetime.now()``.
        """
        from datetime import timezone
        return datetime.now(timezone.utc)

    def init_firebase(self):
        """Connect to Firestore, leaving ``self.db`` as ``None`` on any failure.

        An already-initialised default Firebase app is reused. Otherwise
        credentials are looked up in this order: the ``FIREBASE_CREDENTIALS``
        environment variable (base64-encoded service-account JSON), then a
        local ``firebase-credentials.json`` file.
        """
        import base64

        try:
            try:
                # Public API instead of peeking at firebase_admin._apps:
                # get_app() raises ValueError when no default app exists yet.
                firebase_admin.get_app()
                newly_initialised = False
            except ValueError:
                encoded = os.getenv('FIREBASE_CREDENTIALS')
                if encoded:
                    # HuggingFace Spaces: credentials come from a secret.
                    cred_data = json.loads(base64.b64decode(encoded).decode())
                    cred = credentials.Certificate(cred_data)
                elif os.path.exists('firebase-credentials.json'):
                    # Local development: credentials file next to the app.
                    cred = credentials.Certificate('firebase-credentials.json')
                else:
                    print("No Firebase credentials found. Using local cache fallback.")
                    return
                firebase_admin.initialize_app(cred)
                newly_initialised = True

            self.db = firestore.client()
            if newly_initialised:
                print("Firebase initialized successfully!")
        except Exception as e:
            print(f"Firebase initialization failed: {e}")
            print("Falling back to local cache mode")
            self.db = None

    def _create_cache_key(self, input_text, direction):
        """Return the Firestore document ID for an (input_text, direction) pair.

        The pair is hashed so that special characters and long inputs still
        yield a valid, fixed-length ID. MD5 is used only as a key function,
        not for security; changing it would orphan existing documents.
        """
        import hashlib

        key = f"{input_text}_{direction}"
        return hashlib.md5(key.encode()).hexdigest()

    def get(self, input_text, direction):
        """Return the cached output for the pair, or ``None`` on a miss.

        ``None`` is also returned when Firebase is unavailable or the read
        fails. Usage statistics are updated on a hit, but a failure to update
        them never discards the cached result.
        """
        if self.db is None:
            return None

        try:
            doc_key = self._create_cache_key(input_text, direction)
            doc_ref = self.db.collection('translations').document(doc_key)
            snapshot = doc_ref.get()
            if not snapshot.exists:
                return None
            data = snapshot.to_dict() or {}
        except Exception as e:
            print(f"Cache read error: {e}")
            return None

        try:
            # Server-side increment: a client-side read-modify-write of
            # usage_count loses counts when two requests hit the same entry
            # at the same time.
            doc_ref.update({
                'usage_count': firestore.Increment(1),
                'last_used': self._now(),
            })
        except Exception as e:
            # Statistics are best-effort; the cached output is still valid.
            print(f"Cache usage update error: {e}")

        print(f"Cache hit: {input_text}")
        return data.get('output', '')

    def set(self, input_text, direction, output):
        """Store a fresh translation; return ``True`` on success, else ``False``.

        Any existing document for the same pair is overwritten, which resets
        its correction and usage count.
        """
        if self.db is None:
            return False

        try:
            doc_key = self._create_cache_key(input_text, direction)
            now = self._now()
            doc_data = {
                'input': input_text,
                'direction': direction,
                'output': output,
                'corrected_output': '',
                'timestamp': now,
                'last_used': now,
                'usage_count': 1,
            }
            self.db.collection('translations').document(doc_key).set(doc_data)
            print(f"Cached: {input_text} → {output}")
            return True
        except Exception as e:
            print(f"Cache write error: {e}")
            return False

    def update_correction(self, input_text, direction, corrected_output):
        """Attach a user correction to an existing cached translation.

        Returns ``True`` when the update was written and ``False`` when
        Firebase is unavailable or the update call raises.
        """
        if self.db is None:
            return False

        try:
            doc_key = self._create_cache_key(input_text, direction)
            self.db.collection('translations').document(doc_key).update({
                'corrected_output': corrected_output,
                'correction_timestamp': self._now(),
            })
            print(f"Correction saved: {input_text} → {corrected_output}")
            return True
        except Exception as e:
            print(f"Correction save error: {e}")
            return False

    def get_stats(self):
        """Return a human-readable summary of the cache contents.

        The result is always a string: the statistics block, the text
        ``"Firebase not connected"``, or an error description.
        """
        if self.db is None:
            return "Firebase not connected"

        try:
            total = 0
            corrected = 0
            total_usage = 0
            # Stream documents one at a time instead of materialising the
            # whole collection just to call len() on it.
            for doc in self.db.collection('translations').stream():
                data = doc.to_dict() or {}
                total += 1
                if data.get('corrected_output'):
                    corrected += 1
                total_usage += data.get('usage_count', 0)

            average = total_usage / total if total > 0 else 0
            return "\n".join([
                "Firebase Cache Statistics:",
                f"• Total translations: {total}",
                f"• With corrections: {corrected}",
                f"• Total usage count: {total_usage}",
                f"• Average usage: {average:.1f} per translation",
            ])
        except Exception as e:
            return f"Error getting stats: {e}"
 # Load vocabularies and model
 def load_model_and_vocabs():
     """Load the four vocabulary files and the trained CTC model.

     Returns:
         A 7-tuple ``(model, latin_stoi, latin_itos, arabic_stoi,
         arabic_itos, blank_id, device)``. The model is moved to ``device``
         (CUDA when available, otherwise CPU) and put in eval mode.

     Raises:
         Whatever ``open``, ``json.load``, ``torch.load`` or
         ``load_state_dict`` raise when a file is missing or malformed;
         nothing is caught here.
     """
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

     def _read_vocab(path):
         # All vocabulary files are UTF-8 JSON objects read the same way.
         with open(path, 'r', encoding='utf-8') as f:
             return json.load(f)

     latin_stoi = _read_vocab('latin_stoi.json')
     latin_itos = _read_vocab('latin_itos.json')
     arabic_stoi = _read_vocab('arabic_stoi.json')
     arabic_itos = _read_vocab('arabic_itos.json')

     # Layer sizes are derived from the vocabularies, so the JSON files must
     # be the ones the checkpoint was trained with.
     model = CTCTransliterator(
         input_dim=len(latin_stoi),
         hidden_dim=256,
         output_dim=len(arabic_stoi),
         num_layers=3,
         dropout=0.4
     ).to(device)

     # weights_only=True restricts unpickling to tensors and plain containers,
     # which is all a state_dict needs, so a tampered checkpoint cannot run
     # arbitrary code at load time.
     state_dict = torch.load(
         'CER_0.091_BLEU_0.85_transliterator.pth',
         map_location=device,
         weights_only=True,
     )
     model.load_state_dict(state_dict)
     model.eval()

     # NOTE(review): index 0 is assumed to be the CTC blank symbol - confirm
     # against the training vocabulary.
     blank_id = 0
     return model, latin_stoi, latin_itos, arabic_stoi, arabic_itos, blank_id, device
 # Load everything at startup
+model, latin_stoi, latin_itos, arabic_stoi, arabic_itos, blank_id, device = load_model_and_vocabs()
+firebase_cache = FirebaseCache()
 def encode_text(text, vocab):
     """Encode text using vocabulary"""
     return results
 def transliterate_latin_to_arabic(text):
+    """Transliterate Latin script to Arabic script with Firebase caching"""
     if not text.strip():
         return ""
+    # Check Firebase cache first
+    cached_result = firebase_cache.get(text, "Latin → Arabic")
     if cached_result:
         return cached_result
         decoded = greedy_decode(out, arabic_itos, blank_id)
         result = decoded[0] if decoded else ""
+        # Cache the result in Firebase
+        firebase_cache.set(text, "Latin → Arabic", result)
         return result
     except Exception as e:
+        return f"Error: {str(e)}"
 def transliterate_arabic_to_latin(text):
     """Transliterate Arabic script to Latin script (placeholder)"""
         return transliterate_arabic_to_latin(text)
 def save_correction(input_text, direction, corrected_output):
     """Store a user's corrected output and return a status message for the UI."""
     saved = firebase_cache.update_correction(input_text, direction, corrected_output)
     if not saved:
         return "Could not save correction to Firebase."
     return "Correction saved to Firebase! Thank you for improving the model."
 # Arabic keyboard layout
 arabic_keys = [
     ['ذ', '١', '٢', '٣', '٤', '٥', '٦', '٧', '٨', '٩', '٠']
 ]
+# Create Gradio interface
 def create_interface():
+    with gr.Blocks(title="Darija Transliterator", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
+            # Darija Transliterator
             Convert between Latin script and Arabic script for Moroccan Darija
+            **Firebase-Powered**: Persistent caching across sessions
+            **Arabic Keyboard**: Built-in Arabic keyboard for corrections
+            **Real-time Stats**: Live usage analytics
             """
         )
+        # Stats section
         with gr.Row():
+            stats_btn = gr.Button("Show Statistics", variant="secondary")
+            stats_display = gr.Textbox(
+                label="Firebase Statistics",
+                interactive=False,
+                visible=False,
+                lines=5
+            )
         with gr.Row():
             with gr.Column(scale=1):
                     with gr.Row():
                         space_btn = gr.Button("Space", size="sm", scale=2)
                         backspace_btn = gr.Button("⌫ Backspace", size="sm", scale=2)
+                        clear_output_btn = gr.Button("Clear Output", size="sm", scale=2)
                 # Correction system
                 with gr.Group():
         # Stats button
         stats_btn.click(
+            fn=firebase_cache.get_stats,
             outputs=[stats_display]
         ).then(
             fn=lambda: gr.update(visible=True),
         gr.Markdown("### Examples")
         examples = [
             ["kifash nta?", "Latin → Arabic"],
+            ["salam alikoum", "Latin → Arabic"],
             ["ana bem", "Latin → Arabic"],
             ["wach nta mjit?", "Latin → Arabic"],
             ["شكون نتا؟", "Arabic → Latin"],
             outputs=[correction_status]
         )
+        # Information
         gr.Markdown(
             """
+            ### About
+            This model transliterates Moroccan Darija between Latin and Arabic scripts using a CTC-based neural network.
+            **Firebase Features:**
+            - **Persistent Storage**: All translations are saved permanently
+            - **Analytics**: Track usage patterns and popular translations
+            - **Fast Responses**: Cached results load instantly
+            - **Global Access**: Data synced across all users
+            - **Corrections**: Help improve the model by fixing outputs
+            **How to help improve the model:**
+            1. Use the Arabic keyboard to correct any wrong translations
+            2. Click "Save Correction" to store your improvement
+            3. Your corrections help train better models for everyone!
             """
         )
 # Launch the app
 if __name__ == "__main__":
     demo = create_interface()
+    demo.launch(share=True)