|
|
""" |
|
|
Customized phonemization wrapper. |
|
|
Original wrapper by Lingjuan Zhu (@lingjzhu): |
|
|
https://github.com/lingjzhu/text2phonemesequence |
|
|
|
|
|
""" |
|
|
import csv |
|
|
import json |
|
|
import os |
|
|
import re |
|
|
import subprocess |
|
|
import warnings |
|
|
|
|
|
import segments |
|
|
import torch |
|
|
import transformers |
|
|
|
|
|
|
|
|
class Grapheme2Phoneme:
    """
    Grapheme-to-phoneme converter combining a pronunciation dictionary and a T5 model.

    Words present in the downloaded CharsiuG2P dictionary are looked up directly;
    every other word is sent to the "charsiu/g2p_multilingual_byT5_small_100" model.
    The resulting phoneme strings are finally passed through ``segments.Tokenizer``.
    """

    # Two-letter codes accepted by the constructor -> three-letter codes used
    # for the dictionary file name and for the model prompt.
    _LANGUAGE_CODES = {
        "fr": "fra",
        "it": "ita",
    }
    _DICT_URL = "https://raw.githubusercontent.com/lingjzhu/CharsiuG2P/main/dicts/{code}.tsv"

    # Compiled once instead of on every word of every call.
    _PAD_PATTERN = re.compile(r"(?<!\s)\[PAD](?!\s)")
    _UNK_PATTERN = re.compile(r"(?<!\s)\[UNK](?!\s)")
    _SPACES_PATTERN = re.compile(r" +")

    def __init__(
        self,
        words_to_exclude=None,
        tokenizer="google/byt5-small",
        language="fr",
        cuda=True,
        folder_language="lang_dict",
    ):
        """
        Load the rules to convert from a language to the corresponding phonemes.

        The pronunciation dictionary of the language is downloaded with ``wget``
        into ``folder_language`` when it is not already there.

        :param list words_to_exclude: List of words to exclude from phonemization.
            Defaults to ``["[UNK]"]``.
        :param str tokenizer: Pre-trained tokenizer model name.
        :param str language: Language code for phonemization. Two letters only (e.g.: "fr")
        :param bool cuda: Flag to use CUDA for GPU acceleration. A warning is emitted
            and the CPU is used when CUDA is requested but unavailable.
        :param str folder_language: Folder path to save language dictionaries.
        :raises KeyError: If ``language`` is not one of the supported codes.
        :raises ValueError: If a dictionary row does not have exactly two columns.
        :raises subprocess.CalledProcessError: If the dictionary download fails.
        """
        # Validate the language before the (expensive) model loading so that a
        # typo fails fast and with an explicit message.
        try:
            self.language = self._LANGUAGE_CODES[language]
        except KeyError:
            supported = ", ".join(sorted(self._LANGUAGE_CODES))
            raise KeyError(
                f"Unsupported language {language!r}; supported codes: {supported}"
            ) from None

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer)
        self.model = transformers.T5ForConditionalGeneration.from_pretrained(
            "charsiu/g2p_multilingual_byT5_small_100"
        )
        device = "cpu"
        if cuda:
            if torch.cuda.is_available():
                device = "cuda:0"
            else:
                warnings.warn("CUDA is not available but was requested")
        self.device = torch.device(device)
        self.model = self.model.to(self.device)

        if words_to_exclude is None:
            words_to_exclude = ["[UNK]"]
        self.exclude_token = words_to_exclude
        self.segment_tool = segments.Tokenizer()
        self.phone_dict = {}

        os.makedirs(folder_language, exist_ok=True)
        language_path = os.path.join(folder_language, self.language + ".tsv")
        if not os.path.exists(language_path):
            self._download_dictionary(language_path)

        config_path = "configs/phoneme_lengths.json"
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as file:
                self.phoneme_lengths = json.load(file)
        else:
            warnings.warn("Loading dummy values for phonemes lengths!")
            self.phoneme_lengths = {
                "fra.tsv": 50,
                "ita.tsv": 50,
            }

        if os.path.exists(language_path):
            self._load_dictionary(language_path)

    def _download_dictionary(self, language_path):
        """Download the pronunciation dictionary of ``self.language`` to ``language_path``."""
        # wget creates its output file even when the transfer fails. Download
        # next to the final location and only move the file into place on
        # success, so a failed attempt never leaves a truncated dictionary
        # that later runs would silently accept.
        partial_path = language_path + ".part"
        try:
            subprocess.run(
                [
                    "wget",
                    "-O",
                    partial_path,
                    self._DICT_URL.format(code=self.language),
                ],
                check=True,
            )
            os.replace(partial_path, language_path)
        finally:
            # Only still present when the download or the move failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def _load_dictionary(self, language_path):
        """Fill ``self.phone_dict`` from a two-column (word, pronunciations) TSV file."""
        with open(language_path, "r", encoding="utf-8", newline="") as file:
            rows = csv.reader(file, delimiter="\t")
            for line_number, row in enumerate(rows, start=1):
                if not row:
                    # Blank line (e.g. trailing newline): nothing to load.
                    continue
                if len(row) != 2:
                    raise ValueError(
                        f"{language_path}:{line_number}: expected 2 tab-separated "
                        f"columns, found {len(row)}"
                    )
                word, pronunciations = row
                # Keep only the first of the comma-separated pronunciations.
                self.phone_dict[word] = [pronunciations.split(",")[0]]

    def phonemize(self, words):
        """
        Convert text to phonemes using the dictionary first and the T5 model otherwise.

        :param list[str] words: Input text to be converted.
        :return list[str]: Phonemized words, in the same order as ``words``. Excluded
            words are returned without being phonemized or segmented.
        """
        phonemes_list = [""] * len(words)
        # (position in ``words``, cleaned word) pairs that need the model.
        pending = []
        for position, raw_word in enumerate(words):
            # Form of the word before placeholder rewriting, used for exclusion.
            unmasked = self._SPACES_PATTERN.sub(" ", raw_word).strip()

            # Rewrite [PAD]/[UNK] placeholders (when not surrounded by
            # whitespace) into the <pad>/<unk> spelling, then squeeze spaces.
            word = self._PAD_PATTERN.sub("<pad>", raw_word)
            word = self._UNK_PATTERN.sub("<unk>", word)
            word = self._SPACES_PATTERN.sub(" ", word).strip()

            if word in self.phone_dict:
                phonemes_list[position] = self.phone_dict[word][0]
            elif word in self.exclude_token:
                phonemes_list[position] = word
            elif unmasked in self.exclude_token:
                # Without this check the default "[UNK]" exclusion could never
                # match: the word has already been rewritten to "<unk>" above.
                phonemes_list[position] = unmasked
            else:
                pending.append((position, word))

        if pending:
            out = self.tokenizer(
                [f"<{self.language}>: {word}" for _, word in pending],
                padding=True,
                add_special_tokens=False,
                return_tensors="pt",
            )
            predictions = self.model.generate(
                input_ids=out["input_ids"].to(self.device),
                attention_mask=out["attention_mask"].to(self.device),
                num_beams=1,
                max_length=self.phoneme_lengths[self.language + ".tsv"],
            )
            phonemes = self.tokenizer.batch_decode(
                predictions, skip_special_tokens=True
            )
            for (position, _), phonemized in zip(pending, phonemes):
                phonemes_list[position] = phonemized

        for position, phoneme in enumerate(phonemes_list):
            if phoneme in self.exclude_token:
                continue
            try:
                phonemes_list[position] = self.segment_tool(phoneme, ipa=True)
            except IndexError:
                # IPA-mode segmentation failed on this string; fall back to
                # the tokenizer's default mode.
                phonemes_list[position] = self.segment_tool(phoneme)

        return phonemes_list
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: phonemize a handful of Italian word pairs on the CPU
    # and print the resulting list.
    demo_inputs = [
        "cuore, prezzemolo",
        "margherita, girotondo",
        "mepola, rofe",
        "presemma, costro",
        "lada, vufebu",
        "nurto, frimace",
        "banzione, ceglirate",
        "banzione, alcelisto",
    ]
    italian_g2p = Grapheme2Phoneme(language="it", cuda=False)
    print(italian_g2p.phonemize(demo_inputs))
|
|
|