Spaces:

uzagi
/

speak-smart

Sleeping

App Files Files Community

uzagi commited on Mar 16

Commit

cb3e494

1 Parent(s): ff685cc

add phoneme

Browse files

Files changed (6) hide show

Dockerfile +1 -1
__pycache__/app.cpython-312.pyc +0 -0
app.py +7 -1
model.py +13 -0
phoneme.py +152 -0
requirements.txt +6 -1

Dockerfile CHANGED Viewed

@@ -10,4 +10,4 @@ COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--reload"]

__pycache__/app.cpython-312.pyc ADDED Viewed

Binary file (393 Bytes). View file

app.py CHANGED Viewed

@@ -1,7 +1,13 @@
 from fastapi import FastAPI
 app = FastAPI()
 @app.get("/")
 def greet_json():
-    return {"Hello": "World!"}

 from fastapi import FastAPI
+from phoneme import test_sound
 app = FastAPI()
 @app.get("/")
 def greet_json():
+    return {"Hello": "World!", "Eat": "Cat"}
+@app.post("/phoneme-scoring")
+def scoring(input_text, audio):
+    test_sound()

model.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from pydantic import BaseModel
+class ResponseData(BaseModel):
+    text: str
+class PhonemeRequest(BaseModel):
+    transcript: str
+    audio: str
+class PhonemeResponse(BaseModel):
+    code: int
+    message: str
+    data: {}

phoneme.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import os
+import time
+import torch
+from transformers import AutoProcessor, AutoModelForCTC, Wav2Vec2PhonemeCTCTokenizer
+import librosa
+from itertools import groupby
+from datasets import load_dataset
+from phonemizer import phonemize
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+# PHONEMIZER_ESPEAK_LIBRARY="c:\Program Files\eSpeak NG\libespeak-ng.dll"
+# PHONEMIZER_ESPEAK_PATH="c:\Program Files\eSpeak NG"
+# ESPEAK_PATH = os.getenv("PHONEMIZER_ESPEAK_LIBRARY")
+# if ESPEAK_PATH is not None:
+#     EspeakWrapper.set_library(ESPEAK_PATH)
+# print(f"Loaded environment variables PHONEMIZER_ESPEAK_LIBRARY: {ESPEAK_PATH}")
+# print(f"Using espeak library: {EspeakWrapper.library_path}")
+# Load the model and processor
+# checkpoint = "bookbot/wav2vec2-ljspeech-gruut"
+checkpoint = "facebook/wav2vec2-lv-60-espeak-cv-ft"
+model = AutoModelForCTC.from_pretrained(checkpoint)
+processor = AutoProcessor.from_pretrained(checkpoint)
+tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained(checkpoint)
+sr = processor.feature_extractor.sampling_rate
+def decode_phonemes(
+    ids: torch.Tensor, processor: AutoProcessor, ignore_stress: bool = False
+) -> str:
+    """CTC-like decoding. First removes consecutive duplicates, then removes special tokens."""
+    # Remove consecutive duplicates
+    ids = [id_ for id_, _ in groupby(ids)]
+    special_token_ids = processor.tokenizer.all_special_ids + [
+        processor.tokenizer.word_delimiter_token_id
+    ]
+    # Convert id to token, skipping special tokens
+    phonemes = [processor.decode(id_) for id_ in ids if id_ not in special_token_ids]
+    # Join phonemes
+    prediction = " ".join(phonemes)
+    # Ignore IPA stress marks if specified
+    if ignore_stress:
+        prediction = prediction.replace("ˈ", "").replace("ˌ", "")
+    return prediction
+def text_to_phonemes(text: str) -> str:
+    s_time = time.time()
+    """Convert text to phonemes using phonemizer."""
+    # phonemes = phonemize(text, language="en-us", backend="espeak", strip=True)
+    phonemes = tokenizer.phonemize(text, phonemizer_lang="en-us")
+    e_time = time.time()
+    print(f"Execution time of text_to_phonemes: {e_time - s_time:.6f} seconds")
+    return phonemes
+def text_to_phonemes_2(text: str) -> str:
+    s_time = time.time()
+    """Convert text to phonemes using phonemizer."""
+    phonemes = phonemize(text, language="en-us", backend="espeak", strip=True)
+    # phonemes = tokenizer.phonemize(text)
+    e_time = time.time()
+    print(f"Execution time of text_to_phonemes_2: {e_time - s_time:.6f} seconds")
+    return phonemes
+def separate_characters(input_string):
+    no_spaces = input_string.replace(" ", "")
+    spaced_string = " ".join(no_spaces)
+    return spaced_string
+def predict_phonemes(audio_array):
+    # Load audio file and preprocess
+    # audio_array, _ = librosa.load(audio_path, sr=sr)
+    inputs = processor(audio_array, return_tensors="pt", padding=True)
+    # Perform inference
+    with torch.no_grad():
+        logits = model(inputs["input_values"]).logits
+    # Decode the predicted phonemes
+    predicted_ids = torch.argmax(logits, dim=-1)
+    predicted_phonemes = decode_phonemes(
+        predicted_ids[0], processor, ignore_stress=True
+    )
+    return predicted_phonemes  # Return the predicted phonemes
+def adjust_phonemes(predicted: str) -> str:
+    # Replace specific phonemes or patterns as needed
+    # adjusted = predicted.replace(" ə ", " ")  # Remove schwa if it appears alone
+    adjusted = predicted.replace("  ", " ")  # Remove double spaces
+    adjusted = adjusted.strip()  # Trim leading/trailing spaces
+    return adjusted
+def calculate_score(expected: str, predicted: str) -> float:
+    expected_list = expected.split()
+    predicted_list = predicted.split()
+    # Calculate the number of correct matches
+    correct_matches = sum(1 for e, p in zip(expected_list, predicted_list) if e == p)
+    # Calculate the score as the ratio of correct matches to expected phonemes
+    score = correct_matches / len(expected_list) if expected_list else 0
+    return score
+def test_sound():
+    start_time = time.time()
+    ds = load_dataset(
+        "patrickvonplaten/librispeech_asr_dummy",
+        "clean",
+        split="validation",
+        trust_remote_code=True,
+    )
+    audio_array = ds[0]["audio"]["array"]
+    text = ds[0]["text"]
+    # audio_path = "hello.wav"
+    # text = "Hello"
+    expected_transcript = text  # Expected transcript
+    expected_phonemes = text_to_phonemes(text)  # Expected phonemes for "Hello"
+    expected_phonemes = separate_characters(expected_phonemes)
+    # Call the phoneme prediction function
+    predicted_phonemes = predict_phonemes(audio_array)
+    adjusted_phonemes = adjust_phonemes(predicted_phonemes)
+    # expected_phonemes_2 = text_to_phonemes_2(expected_transcript)
+    print(f"Expected Phonemes: {expected_phonemes}")
+    # print(f"Expected Phonemes 2: {expected_phonemes_2}")
+    print(f"Predicted Phonemes: {predicted_phonemes}")
+    print(f"Adjusted Phonemes: {adjusted_phonemes}")
+    # Calculate score based on expected and predicted phonemes
+    score = calculate_score(expected_phonemes, adjusted_phonemes)
+    # Prepare the output
+    text = f"Transcript: {expected_transcript}\nExpected Phonemes: {expected_phonemes}\nPredicted Phonemes: {predicted_phonemes}\nAdjusted Phonemes: {adjusted_phonemes}\nScore: {score:.2f}"
+    end_time = time.time()
+    execution_time = end_time - start_time
+    print(f"Execution time: {execution_time:.6f} seconds")
+    return {"text": text}

requirements.txt CHANGED Viewed

@@ -1,2 +1,7 @@
 fastapi
-uvicorn[standard]

 fastapi
+uvicorn[standard]
+torch
+transformers
+librosa
+phonemizer
+datasets