humanizer-ai

Sleeping

App Files Files Community

mahmoudsaber0 committed on Oct 23

Commit

a2b4143

verified ·

1 Parent(s): b85a20c

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -465

app.py CHANGED Viewed

@@ -1,469 +1,124 @@
-"""
-Advanced AI Humanizer - Full (heavy) and Light (CPU-friendly) versions
-This single-file implementation provides two modes:
- - mode='heavy' : uses transformers, sentence-transformers, spaCy when available.
- - mode='light' : CPU-friendly fallback using WordNet, simple heuristics, and minimal external deps.
-Usage: instantiate AdvancedAIHumanizerEnhanced(mode='heavy'|'light') and call .humanize_text(text, intensity)
-Notes:
- - All heavy-model loads are lazy and protected by try/except.
- - The script intentionally avoids forcing model downloads at import-time.
- - If you run locally and want heavy behavior, install: transformers, sentence-transformers, spacy, torch, lemminflect
-"""
 import os
-import re
 import random
-import math
-import string
-from typing import List, Tuple, Optional
-from collections import defaultdict, Counter
-# NLP basics
-try:
-    import nltk
-    from nltk.tokenize import sent_tokenize, word_tokenize
-    from nltk.corpus import stopwords, wordnet
-    nltk_available = True
-except Exception:
-    nltk_available = False
-# Optional heavy libs
-try:
-    import spacy
-    spacy_available = True
-except Exception:
-    spacy_available = False
-try:
-    import torch
-    from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM
-    transformers_available = True
-except Exception:
-    transformers_available = False
-try:
-    from sentence_transformers import SentenceTransformer
-    st_available = True
-except Exception:
-    st_available = False
-# Optional morphological inflection
-try:
-    import lemminflect
-    lemminflect_available = True
-except Exception:
-    lemminflect_available = False
-# simple readability (textstat) fallback
-try:
-    from textstat import flesch_reading_ease, flesch_kincaid_grade
-except Exception:
-    def flesch_reading_ease(text):
-        return 60.0
-    def flesch_kincaid_grade(text):
-        return 8.0
-# Ensure NLTK data path and downloads if available
-if nltk_available:
-    try:
-        nltk.data.path.append('/tmp/nltk_data')
-        os.makedirs('/tmp/nltk_data', exist_ok=True)
-        for pkg in ("punkt", "averaged_perceptron_tagger", "stopwords", "wordnet", "omw-1.4"):
-            try:
-                nltk.download(pkg, download_dir='/tmp/nltk_data', quiet=True)
-            except Exception:
-                pass
-    except Exception:
-        pass
-# Helper: safe lower/tokenize
-def safe_word_tokenize(text: str) -> List[str]:
-    if nltk_available:
-        return word_tokenize(text)
-    return re.findall(r"\w+", text)
-def safe_sent_tokenize(text: str) -> List[str]:
-    if nltk_available:
-        return sent_tokenize(text)
-    # naive split
-    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
-class AdvancedAIHumanizerEnhanced:
-    def __init__(self, mode: str = 'heavy'):
-        """mode: 'heavy' uses transformers + sentence-transformers + spacy when available; 'light' uses CPU-friendly heuristics."""
-        self.mode = mode
-        # basic resources
-        self.stop_words = set(stopwords.words('english')) if nltk_available else set()
-        self._init_word_groups()
-        # lazy model placeholders
-        self._masked_pipe = None
-        self._paraphrase_gen = None
-        self._sentence_model = None
-        self._nlp = None
-        if mode == 'heavy':
-            self._lazy_load_heavy()
-        else:
-            # minimal initialization for light mode
-            if spacy_available:
-                try:
-                    self._nlp = spacy.load('en_core_web_sm')
-                except Exception:
-                    self._nlp = None
-    def _init_word_groups(self):
-        self.word_groups = {
-            'analyze': ['examine', 'study', 'investigate', 'explore', 'review', 'assess'],
-            'important': ['crucial', 'vital', 'significant', 'essential', 'key', 'critical'],
-            'improve': ['enhance', 'better', 'upgrade', 'refine', 'advance', 'boost'],
-        }
-        # reverse map
-        self.synonym_map = {}
-        for base, syns in self.word_groups.items():
-            for s in syns:
-                self.synonym_map.setdefault(s, []).append(base)
-    def _lazy_load_heavy(self):
-        """Load heavy models if available; done lazily at init for heavy mode."""
-        # sentence transformer
-        if st_available:
-            try:
-                self._sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
-            except Exception:
-                self._sentence_model = None
-        # masked LM pipeline
-        if transformers_available:
-            try:
-                self._masked_pipe = pipeline('fill-mask', model='bert-base-uncased')
-            except Exception:
-                self._masked_pipe = None
-            # paraphrase generator (T5-small fallback)
-            try:
-                self._paraphrase_tok = AutoTokenizer.from_pretrained('t5-small')
-                self._paraphrase_gen = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
-            except Exception:
-                self._paraphrase_gen = None
-        # spacy
-        if spacy_available:
-            try:
-                self._nlp = spacy.load('en_core_web_sm')
-            except Exception:
-                try:
-                    os.system('python -m spacy download en_core_web_sm')
-                    self._nlp = spacy.load('en_core_web_sm')
-                except Exception:
-                    self._nlp = None
-    # ---------------- Lightweight utilities -----------------
-    def _light_paraphrase_simple(self, text: str) -> List[str]:
-        """Lightweight paraphrase by simple heuristics: swap synonyms from word_groups and reorder short phrases."""
-        candidates = []
-        words = safe_word_tokenize(text)
-        for _ in range(3):
-            out = []
-            for w in words:
-                lw = w.lower()
-                if lw in self.word_groups and random.random() < 0.5:
-                    out.append(random.choice(self.word_groups[lw]))
-                else:
-                    out.append(w)
-            # simple reorder of clauses
-            s = ' '.join(out)
-            if ',' in s and random.random() < 0.4:
-                parts = s.split(',', 1)
-                s = parts[1].strip().capitalize() + '. ' + parts[0].strip()
-            candidates.append(s)
-        return list(dict.fromkeys(candidates))
-    # ---------------- Heavy helpers (masked LM candidates) -----------------
-    def masked_candidates(self, sentence: str, target_token: str, top_k: int = 6) -> List[Tuple[str, float]]:
-        """Return (candidate,score) from fill-mask model based on replacing first occurrence of target_token."""
-        if not self._masked_pipe:
-            return []
-        mask = self._masked_pipe.tokenizer.mask_token
-        # replace token occurrence carefully (word-boundary)
-        pattern = re.compile(r'\b' + re.escape(target_token) + r'\b', flags=re.IGNORECASE)
-        if not pattern.search(sentence):
-            return []
-        masked = pattern.sub(mask, sentence, count=1)
-        try:
-            preds = self._masked_pipe(masked, top_k=top_k)
-            candidates = []
-            for p in preds:
-                tok = p.get('token_str','').strip()
-                score = float(p.get('score', 0.0))
-                if tok and tok.lower() != target_token.lower():
-                    candidates.append((tok, score))
-            # dedup preserving best score
-            uniq = {}
-            for tok, sc in candidates:
-                if tok not in uniq or sc > uniq[tok]:
-                    uniq[tok] = sc
-            return sorted(list(uniq.items()), key=lambda x: x[1], reverse=True)
-        except Exception:
-            return []
-    # ---------------- Paraphrase sampling (heavy) -----------------
-    def sample_paraphrases(self, text: str, num_return: int = 4, max_length: int = 256) -> List[str]:
-        if self._paraphrase_gen is None:
-            # fallback to light paraphrase
-            return self._light_paraphrase_simple(text)
-        try:
-            inp = 'paraphrase: ' + text + ' </s>'
-            inputs = self._paraphrase_tok.encode(inp, return_tensors='pt', truncation=True, max_length=512)
-            outputs = self._paraphrase_gen.generate(
-                inputs, do_sample=True, top_p=0.9, temperature=0.9,
-                num_return_sequences=num_return, max_length=max_length, no_repeat_ngram_size=3
-            )
-            decoded = [self._paraphrase_tok.decode(o, skip_special_tokens=True, clean_up_tokenization_spaces=True) for o in outputs]
-            # dedupe
-            return list(dict.fromkeys(decoded))
-        except Exception:
-            return self._light_paraphrase_simple(text)
-    # ---------------- Scoring -----------------
-    def get_semantic_similarity(self, text1: str, text2: str) -> float:
-        """Use sentence-transformer if available, else fallback to Jaccard-like heuristic."""
-        try:
-            if self._sentence_model:
-                emb = self._sentence_model.encode([text1, text2])
-                # compute cosine manually (avoid sklearn dependency)
-                a, b = emb[0], emb[1]
-                num = sum(x*y for x,y in zip(a,b))
-                den_a = math.sqrt(sum(x*x for x in a))
-                den_b = math.sqrt(sum(x*x for x in b))
-                if den_a == 0 or den_b == 0:
-                    return 0.8
-                return max(0.0, min(1.0, num / (den_a*den_b)))
-            else:
-                s1 = set(safe_word_tokenize(text1.lower()))
-                s2 = set(safe_word_tokenize(text2.lower()))
-                if not s1 or not s2:
-                    return 0.8
-                inter = len(s1 & s2)
-                uni = len(s1 | s2)
-                return max(0.0, inter/uni)
-        except Exception:
-            return 0.8
-    def score_candidate(self, original: str, candidate: str) -> float:
-        """Combine semantic similarity and a lightweight fluency proxy to score candidates."""
-        sim = self.get_semantic_similarity(original, candidate)
-        # fluency proxy: prefer sentences with punctuation and average word length similar to original
-        def avg_word_len(s):
-            ws = [w for w in re.findall(r"\w+", s)]
-            return sum(len(w) for w in ws)/len(ws) if ws else 4
-        avg_orig = avg_word_len(original)
-        avg_cand = avg_word_len(candidate)
-        len_pen = 1 - min(0.2, abs(avg_orig-avg_cand)/10)
-        score = 0.85*sim + 0.15*len_pen
-        # small randomness to diversify
-        score += random.uniform(-0.02, 0.02)
-        return float(max(0.0, min(1.0, score)))
-    # ---------------- Contextual synonym replacement -----------------
-    def contextual_synonym_replace(self, sentence: str, max_replacements: int = 2, top_k: int = 6) -> str:
-        """Try masked LM suggestions for content words and pick best-scoring replacements.
-        Falls back to WordNet-based synonyms when heavy models not available.
-        """
-        if not sentence or len(sentence.split()) < 3:
-            return sentence
-        # choose content tokens (light heuristic)
-        tokens = safe_word_tokenize(sentence)
-        candidate_indices = [i for i,w in enumerate(tokens) if w.isalpha() and len(w)>3 and w.lower() not in self.stop_words]
-        random.shuffle(candidate_indices)
-        replaced = ' '.join(tokens)
-        replacements = 0
-        for idx in candidate_indices:
-            if replacements >= max_replacements:
-                break
-            target = tokens[idx]
-            # protect numerics or tokens with uppercase inside (possible entities)
-            if any(ch.isdigit() for ch in target) or (target[0].isupper() and not target.islower()):
-                continue
-            # heavy path
-            if self.mode == 'heavy' and self._masked_pipe:
-                cands = self.masked_candidates(replaced, target, top_k=top_k)
-                best = None
-                best_score = -1
-                for cand, cand_score in cands:
-                    trial = re.sub(r"\b"+re.escape(target)+r"\b", cand, replaced, count=1)
-                    sc = self.score_candidate(sentence, trial)
-                    if sc > best_score:
-                        best = trial
-                        best_score = sc
-                if best and best_score > 0.7:
-                    replaced = best
-                    replacements += 1
-                    continue
-            # light path (WordNet synonyms)
-            syns = []
-            try:
-                for syn in wordnet.synsets(target.lower()):
-                    for lemma in syn.lemmas():
-                        name = lemma.name().replace('_',' ')
-                        if name.lower() != target.lower() and len(name)>2:
-                            syns.append(name)
-            except Exception:
-                syns = []
-            syns = list(dict.fromkeys(syns))
-            if syns:
-                chosen = random.choice(syns)
-                trial = re.sub(r"\b"+re.escape(target)+r"\b", chosen, replaced, count=1)
-                sc = self.score_candidate(sentence, trial)
-                if sc > 0.6:
-                    replaced = trial
-                    replacements += 1
-        return replaced
-    # ---------------- Dynamic connector generation -----------------
-    def generate_connectors(self, style: str = 'casual', n: int = 6) -> List[str]:
-        base = ["Actually,", "Honestly,", "Basically,", "Really,", "Generally,", "Usually,", "Often,", "Sometimes,"]
-        if self.mode == 'heavy' and self._paraphrase_gen:
-            try:
-                # use paraphrase model to produce short starters
-                prompt = f"Produce {n} short natural sentence starters in {style} English separated by |||"
-                inp = 'paraphrase: ' + prompt + ' </s>'
-                tokens = self._paraphrase_tok.encode(inp, return_tensors='pt', truncation=True)
-                out = self._paraphrase_gen.generate(tokens, max_length=120)
-                decoded = self._paraphrase_tok.decode(out[0], skip_special_tokens=True)
-                parts = [p.strip() for p in decoded.split('|||') if p.strip()]
-                if parts:
-                    return parts[:n]
-            except Exception:
-                pass
-        # fallback sampling and shuffle
-        random.shuffle(base)
-        return base[:n]
-    # ---------------- Paraphrase-and-score pipeline -----------------
-    def paraphrase_and_select(self, sentence: str, num_return: int = 4, threshold: float = 0.72) -> str:
-        # generate candidates
-        if self.mode == 'heavy':
-            candidates = self.sample_paraphrases(sentence, num_return=num_return)
-        else:
-            candidates = self._light_paraphrase_simple(sentence)
-        # always include original as fallback
-        candidates = [c for c in candidates if c and c.strip()]
-        if sentence not in candidates:
-            candidates.append(sentence)
-        # score candidates and select highest that preserves meaning
-        scored = [(self.score_candidate(sentence, c), c) for c in candidates]
-        scored.sort(key=lambda x: x[0], reverse=True)
-        best_score, best_sent = scored[0]
-        if best_score >= threshold:
-            return best_sent
-        # try light token-level changes
-        token_changed = self.contextual_synonym_replace(sentence, max_replacements=2)
-        if self.score_candidate(sentence, token_changed) >= 0.6:
-            return token_changed
-        return sentence
-    # ---------------- Multi-pass humanization -----------------
-    def multiple_pass_humanization(self, text: str, intensity: int = 2) -> str:
-        sentences = safe_sent_tokenize(text)
-        out_sents = []
-        for i, s in enumerate(sentences):
-            s_clean = s.strip()
-            if not s_clean:
-                continue
-            # pass 1: paraphrase & select
-            if len(s_clean.split()) > 6 and random.random() < (0.9 if intensity>=2 else 0.6):
-                s_p = self.paraphrase_and_select(s_clean, num_return=4, threshold=0.7 if intensity>=2 else 0.65)
-            else:
-                s_p = s_clean
-            # pass 2: token-level refinement
-            if random.random() < 0.4:
-                s_p = self.contextual_synonym_replace(s_p, max_replacements=1)
-            # pass 3: occasionally add connector
-            if i>0 and random.random() < 0.25:
-                connector = random.choice(self.generate_connectors())
-                s_p = connector + ' ' + s_p[0].lower() + s_p[1:] if s_p else s_p
-            out_sents.append(s_p)
-        return ' '.join(out_sents)
-    # ---------------- Final checks -----------------
-    def calculate_perplexity(self, text: str) -> float:
-        # lightweight entropy-based proxy (keeps original approach)
-        try:
-            words = safe_word_tokenize(text.lower())
-            if not words:
-                return 50.0
-            freq = Counter(words)
-            total = len(words)
-            entropy = 0.0
-            for w in words:
-                p = freq[w]/total
-                entropy -= p * math.log2(p)
-            perp = 2 ** entropy
-            if perp < 20:
-                perp += random.uniform(20,30)
-            return perp
-        except Exception:
-            return random.uniform(45,75)
-    def final_quality_check(self, original: str, processed: str) -> Tuple[str, dict]:
-        metrics = {
-            'semantic_similarity': self.get_semantic_similarity(original, processed),
-            'perplexity': self.calculate_perplexity(processed),
-            'readability': flesch_reading_ease(processed)
-        }
-        # simple cleanup
-        processed = re.sub(r'\s+', ' ', processed).strip()
-        # ensure capitalization after sentence boundaries
-        sents = safe_sent_tokenize(processed)
-        fixed = []
-        for s in sents:
-            if s and s[0].islower():
-                s = s[0].upper() + s[1:]
-            fixed.append(s)
-        processed = ' '.join(fixed)
-        return processed, metrics
-    # ---------------- Public API -----------------
-    def humanize_text(self, text: str, intensity: str = 'standard') -> Tuple[str, dict]:
-        """Main method. intensity in ('light','standard','heavy')"""
-        if not text or not text.strip():
-            return ("", {'error':'no input'})
-        map_level = {'light':1, 'standard':2, 'heavy':3}
-        lvl = map_level.get(intensity, 2)
-        # multi-pass
-        processed = self.multiple_pass_humanization(text, intensity=lvl)
-        processed, metrics = self.final_quality_check(text, processed)
-        # enforce semantic preservation
-        if metrics['semantic_similarity'] < 0.6:
-            # revert to token-level only
-            processed = self.contextual_synonym_replace(text, max_replacements=2)
-            processed, metrics = self.final_quality_check(text, processed)
-        return processed, metrics
-# ---------------- Example CLI usage -----------------
-if __name__ == '__main__':
-    import argparse
-    parser = argparse.ArgumentParser(description='Advanced AI Humanizer - heavy and light modes')
-    parser.add_argument('--mode', choices=['heavy','light'], default='light')
-    parser.add_argument('--intensity', choices=['light','standard','heavy'], default='standard')
-    parser.add_argument('--text', type=str, help='Text to humanize', default='')
-    args = parser.parse_args()
-    humanizer = AdvancedAIHumanizerEnhanced(mode=args.mode)
-    if args.text:
-        out, metrics = humanizer.humanize_text(args.text, intensity=args.intensity)
-        print('\n=== HUMANIZED ===\n')
-        print(out)
-        print('\n=== METRICS ===\n')
-        print(metrics)
     else:
-        print('No --text provided. Run with --text "your text here"')

+# app.py
+# Advanced AI Humanizer Pro (Full + Light) for Hugging Face Spaces
+# Author: Saber (Mahmoud Saber)
 import os
 import random
+import re
+import nltk
+import importlib
+import gradio as gr
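+# WordNet data is fetched at import time; the heavy dependencies (torch, spaCy, transformers, sentence-transformers) are lazy-loaded in load_heavy_dependencies().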
+nltk.download("wordnet", quiet=True)
+from nltk.corpus import wordnet
+# ========== LIGHT MODE ==========
+def get_synonym(word):
+    """Return a random WordNet synonym for ``word``, or ``word`` itself if none exists.
+    Lemma names use underscores for multi-word entries; those are converted to
+    spaces. Candidates equal to ``word`` (case-insensitively) are discarded
+    before the random pick, so a real synonym is returned whenever WordNet
+    offers one.
+    """
+    target = word.lower()
+    candidates = set()
+    for syn in wordnet.synsets(word):
+        for lemma in syn.lemmas():
+            name = lemma.name().replace("_", " ")
+            # Filter here rather than after choosing: otherwise an unlucky draw
+            # of the word itself hides every other available synonym.
+            if name.lower() != target:
+                candidates.add(name)
+    if not candidates:
+        return word
+    # Sort so the pick is reproducible under random.seed(); set order is not.
+    return random.choice(sorted(candidates))
+def humanize_light(text: str) -> str:
+    """Quick, CPU-safe version for humanizing AI text.
+    Removes the stock connectors "however", "moreover", "furthermore" and
+    "thus" (with the comma that follows them), gives every tenth word a 30%
+    chance of being replaced through ``get_synonym``, collapses repeated
+    whitespace, and upper-cases the first character of the result.
+    Returns the rewritten text; an empty input yields an empty string.
+    """
+    def _drop_connector(match):
+        # When the removed connector opened a sentence (it was capitalised),
+        # hand its capital to the word that now starts the sentence.
+        follower = match.group(2)
+        return follower.upper() if match.group(1)[0].isupper() else follower
+    # ",?\s*" also consumes the connector's trailing comma and spacing so that
+    # "However, the plan" becomes "The plan" rather than ", the plan".
+    text = re.sub(
+        r"\b(however|moreover|furthermore|thus)\b,?\s*(\w?)",
+        _drop_connector,
+        text,
+        flags=re.IGNORECASE,
+    )
+    words = text.split()
+    for i in range(0, len(words), 10):
+        if random.random() < 0.3:
+            words[i] = get_synonym(words[i])
+    text = re.sub(r"\s{2,}", " ", " ".join(words)).strip()
+    # Not str.capitalize(): that lowercases the rest of the text and would
+    # flatten proper nouns, acronyms and every later sentence start.
+    return text[:1].upper() + text[1:]
+# ========== HEAVY MODE ==========
+def load_heavy_dependencies():
+    """Import the heavy-mode libraries on demand and publish them as module globals.
+    After this call ``torch``, ``spacy``, ``pipeline`` (from transformers) and
+    ``SentenceTransformer`` (from sentence_transformers) are bound at module
+    level. Any import failure propagates to the caller.
+    """
+    global torch, spacy, pipeline, SentenceTransformer
+    load = importlib.import_module
+    torch = load("torch")
+    spacy = load("spacy")
+    pipeline = getattr(load("transformers"), "pipeline")
+    SentenceTransformer = getattr(load("sentence_transformers"), "SentenceTransformer")
+def humanize_heavy(text: str, intensity: str = "medium") -> str:
+    """Transformer-based deep rewriting for high naturalness.
+    Splits ``text`` into sentences with spaCy, paraphrases each sentence with
+    the "Vamsi/T5_Paraphrase_Paws" text2text pipeline, and joins the results
+    in their original order. ``intensity == "heavy"`` samples at temperature
+    0.8; every other value samples at 0.5.
+    Raises whatever the model loading or generation raises (missing packages,
+    missing spaCy model, download failures); ``run_humanizer`` catches these.
+    """
+    nlp, paraphraser = _get_heavy_models()
+    temperature = 0.8 if intensity == "heavy" else 0.5
+    rewritten = []
+    for sent in nlp(text).sents:
+        sentence = sent.text.strip()
+        if not sentence:
+            # Whitespace-only spans carry nothing to paraphrase.
+            continue
+        result = paraphraser(
+            f"paraphrase: {sentence}",
+            max_length=128,
+            num_return_sequences=1,
+            # temperature only takes effect when sampling is enabled.
+            do_sample=True,
+            temperature=temperature,
+        )
+        rewritten.append(result[0]["generated_text"])
+    # Sentence order is preserved on purpose: shuffling the sentences makes the
+    # output incoherent instead of more natural.
+    return " ".join(rewritten).strip()
+# Process-wide cache for the heavy models, filled on first use.
+_HEAVY_MODELS = {}
+def _get_heavy_models():
+    """Return ``(nlp, paraphraser)``, loading both only on the first call."""
+    if not _HEAVY_MODELS:
+        load_heavy_dependencies()
+        nlp = spacy.load("en_core_web_sm")
+        paraphraser = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws")
+        # Publish both entries together so a failed load never leaves a
+        # half-filled cache behind.
+        _HEAVY_MODELS.update(nlp=nlp, paraphraser=paraphraser)
+    return _HEAVY_MODELS["nlp"], _HEAVY_MODELS["paraphraser"]
+# ========== GRADIO UI CREATOR ==========
+def run_humanizer(text, mode="light", intensity="medium"):
+    """Gradio callback: humanize ``text`` in the requested mode.
+    Returns a prompt message when ``text`` is ``None``, empty or whitespace
+    only. ``mode == "light"`` uses ``humanize_light``; any other mode uses
+    ``humanize_heavy`` with ``intensity``. A heavy-mode failure is reported
+    as an error string rather than raised.
+    """
+    # A missing value may arrive as None, which has no .strip().
+    if not text or not text.strip():
+        return "Please enter some text to humanize."
+    if mode == "light":
+        return humanize_light(text)
+    try:
+        return humanize_heavy(text, intensity)
+    except Exception as e:
+        # UI boundary: surface the failure to the user instead of crashing.
+        return f"[Error in heavy mode: {str(e)}] Try switching to light mode."
+def create_enhanced_interface():
+    """Assemble and return the Gradio interface wired to ``run_humanizer``.
+    Inputs are the text box, the mode selector and the intensity selector, in
+    the positional order ``run_humanizer`` expects; the output is one text box.
+    """
+    text_input = gr.Textbox(label="Enter Text", lines=8, placeholder="Paste your AI text here...")
+    mode_input = gr.Radio(["light", "heavy"], label="Mode", value="light")
+    intensity_input = gr.Radio(
+        ["light", "medium", "heavy"],
+        label="Intensity (for heavy mode only)",
+        value="medium",
+    )
+    result_output = gr.Textbox(label="Humanized Text", lines=8)
+    blurb = (
+        "Rewrite AI-generated text into more natural, human-like language. "
+        "'Light' mode runs fast on CPU. 'Heavy' mode uses transformers for deeper rewriting."
+    )
+    return gr.Interface(
+        fn=run_humanizer,
+        inputs=[text_input, mode_input, intensity_input],
+        outputs=result_output,
+        title="🧠 Advanced AI Humanizer Pro",
+        description=blurb,
+        allow_flagging="never",
+    )
+# ========== STARTUP BLOCK ==========
+if __name__ == "__main__":
+    # Script entry point: build the UI and serve it on all interfaces, port 7860.
+    print("🚀 Starting Advanced AI Humanizer Pro...")
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "show_error": True,
+        "share": False,
+    }
+    app = create_enhanced_interface()
+    app.launch(**launch_options)