""" Customized phonemization wrapper. Original wrapper by Lingjuan Zhu (@lingjzhu): https://github.com/lingjzhu/text2phonemesequence """ import csv import json import os import re import subprocess import warnings import segments import torch import transformers class Grapheme2Phoneme: def __init__( self, words_to_exclude=None, tokenizer="google/byt5-small", language="fr", cuda=True, folder_language="lang_dict", ): """ Load the rules to convert from a language to the corresponding phonemes. :param list words_to_exclude: List of words to exclude from phonemization. :param str tokenizer: Pre-trained tokenizer model name. :param str language: Language code for phonemization. Two letters only (e.g.: "fr") :param bool cuda: Flag to use CUDA for GPU acceleration. :param str folder_language: Folder path to save language dictionaries. """ self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer) self.model = transformers.T5ForConditionalGeneration.from_pretrained( "charsiu/g2p_multilingual_byT5_small_100" ) device = "cpu" if cuda: if torch.cuda.is_available(): device = "cuda:0" else: warnings.warn("CUDA is not available but was requested") self.device = torch.device(device) self.model = self.model.to(self.device) if words_to_exclude is None: words_to_exclude = ["[UNK]"] self.exclude_token = words_to_exclude self.segment_tool = segments.Tokenizer() self.language = { "fr": "fra", "it": "ita" }[language] self.phone_dict = {} # Download language dictionary if not exists os.makedirs(folder_language, exist_ok=True) language_path = os.path.join(folder_language, self.language + ".tsv") if not os.path.exists(language_path): subprocess.run( [ "wget", "-O", language_path, f"https://raw.githubusercontent.com/lingjzhu/CharsiuG2P/main/dicts/{self.language}.tsv" ], check=True ) # Setting phonemes length config_path = "configs/phoneme_lengths.json" if os.path.exists(config_path): with open(config_path, "r", encoding="utf-8") as file: self.phoneme_lengths = json.load(file) else: warnings.warn("Loading dummy values for phonemes lengths!") self.phoneme_lengths = { "fra.tsv": 50, "ita.tsv": 50 } if os.path.exists(language_path): with open(language_path, "r", encoding="utf-8") as file: tsv_file = csv.reader(file, delimiter="\t") # printing data line by line for word_phonemes in tsv_file: assert len(word_phonemes) == 2 self.phone_dict[word_phonemes[0]] = [word_phonemes[1].split(",")[0]] def phonemize(self, words): """ Convert text to phonemes using the T5 model. :param list[str] words: Input text to be converted. :return list[str]: Phonemized words. """ phonemes_list = [""] * len(words) new_words = [] for i, word in enumerate(words): # First normalize the spacing around special tokens word = re.sub(r"(?", word) word = re.sub(r"(?", word) # Then collapse multiple spaces word = re.sub(r" +", " ", word).strip() if word in self.phone_dict: phonemes_list[i] = self.phone_dict[word][0] elif word in self.exclude_token: phonemes_list[i] = word else: new_words.append((i, word)) # Then batch the unknown words if new_words: out = self.tokenizer( [f"<{self.language}>: {word[1]}" for word in new_words], padding=True, add_special_tokens=False, return_tensors="pt", ) predictions = self.model.generate( input_ids=out["input_ids"].to(self.device), attention_mask=out["attention_mask"].to(self.device), num_beams=1, max_length=self.phoneme_lengths[self.language + ".tsv"], ) phonemes = self.tokenizer.batch_decode( predictions, skip_special_tokens=True ) for i, phonemized in enumerate(phonemes): phonemes_list[new_words[i][0]] = phonemized for i, phoneme in enumerate(phonemes_list): # skip excluded tokens of segmentation if phoneme not in self.exclude_token: try: phonemes_list[i] = self.segment_tool(phoneme, ipa=True) except IndexError: # Some phonemes may not be recognized phonemes_list[i] = self.segment_tool(phoneme) # fill gaps with padding token return phonemes_list if __name__ == "__main__": phonemizer = Grapheme2Phoneme(language="it", cuda=False) print(phonemizer.phonemize([ "cuore, prezzemolo", "margherita, girotondo", "mepola, rofe", "presemma, costro", "lada, vufebu", "nurto, frimace", "banzione, ceglirate", "banzione, alcelisto" ]))