1
+ from transformers import SpeechT5Processor , SpeechT5ForTextToSpeech , SpeechT5HifiGan
2
+ from datasets import load_dataset
3
+ import torch
4
+ import random
5
+ import string
6
+ import soundfile as sf
7
+
8
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor used by save_text_to_speech to turn raw text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved to the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# Vocoder handed to model.generate_speech, moved to the same device.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset whose rows carry an "xvector" entry; these serve as speaker embeddings.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
17
+
18
# Maps a short speaker code to the row index in the embeddings dataset that is
# used as that speaker's id.
speakers = dict(
    awb=0,      # Scottish male
    bdl=1138,   # US male
    clb=2271,   # US female
    jmk=3403,   # Canadian male
    ksp=4535,   # Indian male
    rms=5667,   # US male
    slt=6799,   # US female
)
28
+
29
# Path separators, characters rejected in file names on common platforms, and
# ASCII control characters; each is replaced by "_" when a file name is derived
# from free text.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in '<>:"/\\|?*' + "".join(map(chr, range(32)))}
)


def _filename_stem(prefix, text, max_words=6):
    """Return ``<prefix>-<first words of text>`` with unsafe characters replaced."""
    words = "-".join(text.split()[:max_words])
    return f"{prefix}-{words}".translate(_UNSAFE_FILENAME_CHARS)


def save_text_to_speech(text, speaker=None):
    """Synthesize ``text`` to speech and write the audio to a file.

    Args:
        text: The text to speak. Its first six whitespace-separated words
            become part of the output file name.
        speaker: Row index into ``embeddings_dataset`` whose ``"xvector"``
            entry is used as the speaker embedding. When ``None``, a random
            ``(1, 512)`` vector is used instead, i.e. a random voice.

    Returns:
        The name of the audio file that was written, relative to the current
        working directory.
    """
    # preprocess text into model inputs on the target device
    inputs = processor(text=text, return_tensors="pt").to(device)

    if speaker is not None:
        # x-vector holding the chosen speaker's voice characteristics;
        # unsqueeze adds the leading batch dimension
        xvector = embeddings_dataset[speaker]["xvector"]
        speaker_embeddings = torch.tensor(xvector).unsqueeze(0).to(device)
        # a known speaker is identified by its id in the file name
        prefix = speaker
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
        # no speaker id available, so tag the file with a random 5-char string
        prefix = "".join(random.sample(string.ascii_letters + string.digits, k=5))

    # generate speech with the model and vocoder
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # The stem is sanitized so that words such as "and/or" or "why?" cannot
    # redirect the write into another directory or produce an invalid name.
    # NOTE(review): writing ".mp3" relies on the installed libsndfile having
    # MP3 support -- confirm for the target environment.
    output_filename = f"{_filename_stem(prefix, text)}.mp3"

    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename
51
+
52
# Speak one short sentence twice: first with the "slt" (US female) speaker,
# then with no speaker given, which makes save_text_to_speech use a random voice.
demo_sentence = "Python is my favorite programming language"
save_text_to_speech(demo_sentence, speaker=speakers["slt"])
save_text_to_speech(demo_sentence)

# a challenging text, spoken by every speaker
text = """In his miracle year, he published four groundbreaking papers.
These outlined the theory of the photoelectric effect, explained Brownian motion,
introduced special relativity, and demonstrated mass-energy equivalence."""

# Each known speaker id in turn, followed by None for a final random voice.
for voice in [*speakers.values(), None]:
    output_filename = save_text_to_speech(text, voice)
    print(f"Saved {output_filename} ")
0 commit comments