Skip to content

Commit 0117c5a

Browse files
committed
add one more method of speech synthesis to the text-to-speech tutorial
1 parent 9cf9a1d commit 0117c5a

File tree

4 files changed

+74
-3
lines changed

4 files changed

+74
-3
lines changed
Binary file not shown.

machine-learning/text-to-speech/requirements

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
pyttsx3
2+
gTTS
3+
playsound
4+
soundfile
5+
transformers
6+
datasets
7+
sentencepiece
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
2+
from datasets import load_dataset
3+
import torch
4+
import random
5+
import string
6+
import soundfile as sf
7+
8+
# run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# processor for the SpeechT5 text-to-speech checkpoint (prepares the text input)
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# the text-to-speech model, moved onto the selected device
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# the HiFi-GAN vocoder handed to the model when generating speech
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# dataset of x-vectors; its rows are used as speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# row index into the embeddings dataset for each named speaker
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)
28+
29+
def _build_output_filename(text, prefix):
    """Return ``"<prefix>-<first six words of text>.mp3"`` as a safe filename."""
    stem = "-".join(text.split()[:6])
    # drop path separators, characters reserved by common filesystems and
    # control characters, so arbitrary text can neither point outside the
    # working directory nor yield a name the OS refuses to create
    unsafe = str.maketrans("", "", '<>:"/\\|?*' + "".join(map(chr, range(32))))
    return f"{prefix}-{stem.translate(unsafe)}.mp3"


def save_text_to_speech(text, speaker=None):
    """Synthesize ``text`` to speech and write it to an audio file.

    Args:
        text: The text to convert to speech.
        speaker: Row index into ``embeddings_dataset`` whose ``"xvector"`` is
            used as the speaker embedding. When ``None``, a random embedding
            (i.e. a random voice) is used instead.

    Returns:
        The name of the file that was written, relative to the current
        working directory.
    """
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from the dataset
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    if speaker is not None:
        # a known speaker is identified by its ID in the filename
        prefix = speaker
    else:
        # no speaker: tag the file with a short random string instead
        prefix = "".join(random.sample(string.ascii_letters + string.digits, k=5))
    output_filename = _build_output_filename(text, prefix)
    # save the generated speech to a file with 16KHz sampling rate
    # NOTE(review): the ".mp3" extension relies on the installed libsndfile
    # being able to encode MP3 - confirm for the target environment
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename
51+
52+
# generate speech with a US female voice
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
# generate speech with a random voice
save_text_to_speech("Python is my favorite programming language")

# a challenging text with all speakers
text = """In his miracle year, he published four groundbreaking papers.
These outlined the theory of the photoelectric effect, explained Brownian motion,
introduced special relativity, and demonstrated mass-energy equivalence."""

# only the speaker IDs are needed here, so iterate over the dict values
for speaker in speakers.values():
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}")
# random speaker
output_filename = save_text_to_speech(text)
print(f"Saved {output_filename}")

0 commit comments

Comments
 (0)