# Imports
import os
import json
import time
import io
import random
import requests
from pydub import AudioSegment
from google.cloud import texttospeech
Set up Google TTS API
First off you will need to set up an Edge Impulse account and create your first project. You will also need a Google Cloud account with the Text to Speech API enabled: https://cloud.google.com/text-to-speech. The first million characters generated each month are free (WaveNet voices); this should be plenty for most cases, as you'll only need to generate your dataset once. From Google you will need to download a credentials JSON file and point the correct environment variable on your system at it to allow the Python API to work: (https://developers.google.com/workspace/guides/create-credentials#service-account)
# Tell the Google Cloud client library where your service account API key lives:
# replace the value below with the path to your own credentials JSON file
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '../path-to-google-credentials-file.json'})
Generate the desired samples
First off we need to set our desired keywords and labels:
# Each entry pairs the text to synthesise (a keyword or short sentence,
# e.g. 'hello world') with the label its samples are stored under
keyword = [
    dict(string='edge', label='edge'),
    dict(string='impulse', label='impulse'),
]
Then we need to set up the parameters for our speech dataset, all possible combinations will be iterated through:
languages - Choose the text to speech voice languages to use (https://cloud.google.com/text-to-speech/docs/voices)
out_length - How long each output sample should be
count - Maximum number of samples to output (if the number of combinations of languages, pitches etc. is higher than this, the output is restricted to this number)
voice_dir - Where to store the clean samples before noise is added
noise_url - Which noise file to download and apply to your samples
output_folder - The final output location of the noised samples
num_copies - How many different noisy versions of each sample to create
max_noise_level - Maximum level of noise to add, in dBFS (a negative value)
# Voice languages to generate samples in
# (see https://cloud.google.com/text-to-speech/docs/voices for the full list)
languages = ['en-GB', 'en-US']
# Pitch offsets to apply to each voice, in semitones (0 = the voice's default pitch)
pitches = [-2, 0, 2]
# Voice genders to request (names of texttospeech.SsmlVoiceGender members)
genders = ['FEMALE', 'MALE']
# Speaking rates to use (1 = the voice's normal speed)
speakingRates = [0.9, 1, 1.1]
# Minimum length of each output sample in seconds (default: 1s);
# shorter clips are padded with silence up to this length
out_length = 1
# Maximum number of samples to generate
count = 30
# Raw sample output directory
voice_dir = 'out-wav'
# Creative commons background noise from freesound.org: https://freesound.org/people/Astounded/sounds/483561/
noise_url = 'https://cdn.freesound.org/previews/483/483561_10201334-lq.ogg'
# Output directory for the samples with noise mixed in
output_folder = 'out-noisy'
# Number of noisy copies to create for each input sample
num_copies = 2
# Maximum noise level to add in dBFS (negative value)
max_noise_level = -5
Then we need to check all the output folders are ready
# Make sure both output directories exist: the one for the noisy files first,
# then the one for the raw voices. exist_ok=True turns the call into a no-op
# for a directory that is already present, so no separate existence check
# (which could race with another process creating it) is needed.
for folder in (output_folder, voice_dir):
    os.makedirs(folder, exist_ok=True)
Then we can generate a list of all possible parameter combinations based on the input earlier. If you have set count to be smaller than the number of samples these combinations would produce (combinations × num_copies), then the options will be reduced:
# Generate all combinations of voice parameters (keywords are added afterwards)
voice_opts = [
    {"pitch": p, "gender": g, "language": lang, "speakingRate": s}
    for p in pitches
    for g in genders
    for lang in languages
    for s in speakingRates
]

# Every voice combination produces one clean sample per keyword, and each of
# those is turned into num_copies noisy samples
samples_per_voice = max(1, num_copies * len(keyword))
if voice_opts and len(voice_opts) * samples_per_voice > count:
    # Too many samples: keep an evenly spaced subset of the voice combinations.
    # Thinning the voices (rather than the voice/keyword pairs) keeps every
    # keyword equally represented, and rounding down keeps the total within
    # count. At least one voice combination is always kept.
    keep = max(1, count // samples_per_voice)
    voice_opts = [voice_opts[i * len(voice_opts) // keep] for i in range(keep)]

# Pair each remaining voice combination with every keyword
all_opts = [
    {**v, "text": kw['string'], "label": kw['label']}
    for v in voice_opts
    for kw in keyword
]
print(f'Generating {len(all_opts)*num_copies} samples')
Finally we iterate through all the options generated, call the Google TTS API to generate the desired sample, and apply noise to it, saving locally with metadata:
# Download the background noise clip and decode it once, up front, so that a
# random section of it can be mixed into every sample below
noise_response = requests.get(noise_url, timeout=30)
noise_response.raise_for_status()
noise_audio = AudioSegment.from_file(io.BytesIO(noise_response.content))

# Instantiate list for file label information
downloaded_files = []

# Instantiates a client
client = texttospeech.TextToSpeechClient()

for ix, o in enumerate(all_opts, start=1):
    # Common stem of every file name generated for this parameter combination
    sample_name = f"{o['label']}.{o['language']}-{o['gender']}-{o['pitch']}-{o['speakingRate']}"
    wav_file_name = f"{voice_dir}/{sample_name}.tts.wav"

    # Only call the API for clean samples that are not already on disk
    if not os.path.exists(wav_file_name):
        print(f"[{ix}/{len(all_opts)}] Text-to-speeching...")

        # Set the text input to be synthesized
        synthesis_input = texttospeech.SynthesisInput(text=o['text'])

        # Build the voice request
        voice = texttospeech.VoiceSelectionParams(
            language_code=o['language'],
            ssml_gender=o['gender']
        )

        # Select the type of audio file you want returned
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            pitch=o['pitch'],
            speaking_rate=o['speakingRate'],
            sample_rate_hertz=16000
        )

        # Perform the text-to-speech request on the text input with the selected
        # voice parameters and audio file type
        response = client.synthesize_speech(
            input=synthesis_input, voice=voice, audio_config=audio_config
        )

        with open(wav_file_name, "wb") as f:
            f.write(response.audio_content)
        has_hit_api = True
    else:
        print(f'skipping {wav_file_name}')
        has_hit_api = False

    # Load voice sample
    voice_audio = AudioSegment.from_file(wav_file_name)

    # Add silence to match output length with random padding
    # (out_length is in seconds, audio lengths are in milliseconds)
    difference = (out_length * 1000) - len(voice_audio)
    if difference > 0:
        padding_before = random.randint(0, difference)
        padding_after = difference - padding_before
        voice_audio = (
            AudioSegment.silent(duration=padding_before)
            + voice_audio
            + AudioSegment.silent(duration=padding_after)
        )

    for i in range(num_copies):
        # Save noisy sample to output folder
        output_filename = f"{sample_name}_noisy_{i+1}.wav"
        output_path = os.path.join(output_folder, output_filename)

        if not os.path.exists(output_path):
            # Select random section of noise and random noise level. The start
            # offset is clamped at 0 so that a noise clip shorter than the
            # sample is used from its beginning instead of raising an error
            max_start = max(0, len(noise_audio) - len(voice_audio))
            start_time = random.randint(0, max_start)
            end_time = start_time + len(voice_audio)
            noise_level = random.uniform(max_noise_level, 0)

            # Extract selected section of noise and adjust volume: the noise is
            # attenuated by between 0 and |max_noise_level| dB
            noise_segment = noise_audio[start_time:end_time]
            noise_segment = noise_segment - abs(noise_level)

            # Mix voice sample with noise segment
            mixed_audio = voice_audio.overlay(noise_segment)

            # Save mixed audio to file
            mixed_audio.export(output_path, format='wav')
            print(f'Saved mixed audio to {output_path}')
        else:
            print(f'skipping {output_path}')

        # Save metadata for file (one entry per noisy copy)
        downloaded_files.append({
            "path": str(output_filename),
            "label": o['label'],
            "category": "split",
            "metadata": {
                "pitch": str(o['pitch']),
                "gender": str(o['gender']),
                "language": str(o['language']),
                "speakingRate": str(o['speakingRate']),
                "text": o['text'],
                "imported_from": "Google Cloud TTS"
            }
        })

    # Pause briefly between API requests; cached samples need no delay
    if has_hit_api:
        time.sleep(0.5)

print("Done text-to-speeching")
print("")
# Collect the per-file label information into the structure written to input.json
info_file = {"version": 1, "files": downloaded_files}

# Output the metadata file next to the noisy samples
input_file = os.path.join(output_folder, 'input.json')
with open(input_file, "w") as f:
    f.write(json.dumps(info_file))
# Move to the out-noisy folder
! cd out-noisy
# Upload all files in the out-noisy folder with metadata attached in the input.json file
! edge-impulse-uploader --info-file input.json *
What next?
Now you can use your keywords to create a robust keyword detection model in Edge Impulse Studio!
Make use of our pre-built keyword dataset to add noise and 'unknown' words to your model: Keyword Spotting Dataset
Try out both classification models and the transfer learning keyword spotting model to see which works best for your case