Kokoro can also speak languages it was not trained on?! (Examples: Greek and Armenian.)
In the model card, I noticed that the user can provide Kokoro with the exact IPA text for audio export. After some trial and error, I created two simple draft Python files that convert Greek and Armenian text into tokens that Kokoro can understand and turn into speech. Of course, the results are not perfect, because the output inherits the tone of the original speaker (the em_santa voice gives the best results, ho ho, MC XD). If there is a better implementation that gives a more natural tone, that would be awesome.
@hexgrad, do you have any suggestions?
Please note that the code can seamlessly produce speech from both Greek and Armenian text.
I also provide a simple GUI (created with an LLM) for the community to play with, in the hope of some improvements.
Finally, I provide an audio example.
Put the following files in the same directory:
greek.py
armenian.py
inference.py
simple_gui.py
and have fun :)
-------------------------------------------------------------------------------------------------------------------
greek.py
class GreekToIPA:
    """Rule-based converter from Greek text to ``[word](/phonemes/)`` markup.

    Words containing Greek letters are rewritten as ``[word](/ipa/)``; every
    other word is passed through untouched.  Digits 0-10 and whole-hour times
    listed in ``time_formats`` are first spelled out in Greek.
    """

    def __init__(self):
        # Basic Greek to IPA mapping (one letter -> phoneme string)
        self.greek_to_ipa = {
            'α': 'a',
            'ά': 'ˈa',
            'β': 'v',
            'γ': 'gh',
            'δ': 'ð',
            'ε': 'ɛ',
            'έ': 'ˈɛ',
            'ζ': 'z',
            'η': 'i',
            'ή': 'ˈi',
            'θ': 'th',
            'ι': 'i',
            'ί': 'ˈi',
            'κ': 'k',
            'λ': 'l',
            'μ': 'm',
            'ν': 'n',
            'ξ': 'ks',
            'ο': 'o',
            'ό': 'ˈo',
            'π': 'p',
            'ρ': 'r',
            'σ': 's',
            'ς': 's',
            'τ': 't',
            'υ': 'i',
            'ύ': 'ˈi',
            'φ': 'f',
            'χ': 'x',
            'ψ': 'ps',
            'ω': 'o',
            'ώ': 'ˈo',
            'ϊ': 'i',
            'ΐ': 'ˈi',
            'ϋ': 'i',
            'ΰ': 'ˈi',
        }
        # Special combinations, applied (in this order) before single letters
        self.digraphs = {
            'αι': 'ɛ',
            'άι': 'ˈai',
            # Stress mark goes before the vowel, like every other accented
            # digraph below (the original had it after the vowel).
            'αί': 'ˈɛ',
            'ει': 'i',
            'εί': 'ˈi',
            'οι': 'i',
            'οί': 'ˈi',
            'ού': 'ˈu',
            'ου': 'u',
            'μπ': 'b',
            'ντ': 'd',
            'γκ': 'ghk',
            'γγ': 'ŋg',
            'αυ': 'av',
            'άυ': 'ˈav',
            'ευ': 'ev',
            'έυ': 'ˈev',
            'εύ': 'ˈev',
        }
        # Numbers 0-10 spelled out in Greek
        self.numbers = {
            '0': 'μηδέν',
            '1': 'ένα',
            '2': 'δύο',
            '3': 'τρία',
            '4': 'τέσσερα',
            '5': 'πέντε',
            '6': 'έξι',
            '7': 'επτά',
            '8': 'οκτώ',
            '9': 'εννέα',
            '10': 'δέκα',
        }
        # Whole-hour time strings spelled out in Greek
        self.time_formats = {
            '00:00': 'μεσάνυχτα',
            '12:00': 'μεσημέρι',
            '13:00': 'μία',
            '14:00': 'δύο',
            '15:00': 'τρεις',
            '16:00': 'τέσσερις',
            '17:00': 'πέντε',
            '18:00': 'έξι',
            '19:00': 'επτά',
            '20:00': 'οκτώ',
            '21:00': 'εννέα',
            '22:00': 'δέκα',
            '23:00': 'έντεκα',
        }

    @staticmethod
    def _map_words(text, word_fn):
        """Apply ``word_fn`` to each whitespace-separated word, line by line.

        Words on a line are re-joined with single spaces (as before), but line
        breaks are kept so that callers splitting the result on newlines still
        see the original lines.  Leading/trailing whitespace is dropped.
        """
        return '\n'.join(
            ' '.join(word_fn(word) for word in line.split())
            for line in text.strip().splitlines()
        )

    def _expand_word(self, word):
        """Return the Greek spelling of a known time/number token, else ``word``."""
        # Time format (HH:MM): only the exact strings in the table are known.
        if word in self.time_formats:
            return self.time_formats[word]
        # Numbers: only the exact digit strings in the table are known.
        if word in self.numbers:
            return self.numbers[word]
        return word

    def _markup_word(self, word):
        """Wrap ``word`` in ``[word](/ipa/)`` if it contains any Greek letter."""
        if any(c in self.greek_to_ipa for c in word.lower()):
            return f'[{word}](/{self.convert_word(word)}/)'
        return word

    def preprocess_text(self, text):
        """Convert numbers and time formats to Greek text before IPA conversion.

        Line breaks in ``text`` are preserved; other whitespace runs collapse
        to a single space.
        """
        return self._map_words(text, self._expand_word)

    def convert_word(self, greek_word):
        """Convert a single Greek word to IPA.

        The word is lower-cased, digraphs are substituted first, then each
        remaining character is mapped individually.  Characters without a
        mapping (punctuation, Latin letters, already-substituted IPA) are
        copied through unchanged.  An empty word yields ``''``.
        """
        if not greek_word:
            return ''
        word = greek_word.lower()
        for digraph, ipa in self.digraphs.items():
            word = word.replace(digraph, ipa)
        return ''.join(self.greek_to_ipa.get(char, char) for char in word)

    def convert_text(self, text):
        """Convert Greek text to IPA with formatting.

        Returns the text with every Greek word replaced by ``[word](/ipa/)``.
        Line breaks are preserved so the result can still be split per line.
        """
        # First preprocess the text (numbers / times -> Greek words)
        preprocessed_text = self.preprocess_text(text)
        return self._map_words(preprocessed_text, self._markup_word)
# # Example usage
# if __name__ == "__main__":
# converter = GreekToIPA()
# test_text = "Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά."
# result = converter.convert_text(test_text)
# print(result)
-------------------------------------------------------------------------------------------------------------------
armenian.py
class ArmenianToIPA:
    """Rule-based converter from Armenian text to ``[word](/phonemes/)`` markup.

    Words containing Armenian letters are rewritten as ``[word](/ipa/)``;
    every other word is passed through untouched.  Digits 0-10 and whole-hour
    times listed in ``time_formats`` are first spelled out in Armenian.
    """

    def __init__(self):
        # Basic Armenian to IPA mapping (one letter -> phoneme string).
        # 'ե', 'կ' and 'ո' are position-dependent and handled in convert_word;
        # their entries here are still used to detect Armenian words.
        self.armenian_to_ipa = {
            'ա': 'ɑ',
            'բ': 'b',
            'գ': 'ɡ',
            'դ': 'd',
            'ե': 'ɛ',
            'զ': 'z',
            'է': 'ɛ',
            'ը': 'ə',
            'թ': 'th',
            'ժ': 'ʒ',
            'ի': 'i',
            'լ': 'l',
            'խ': 'x',
            'ծ': 'ts',
            'կ': 'kə',
            'հ': 'h',
            'ձ': 'dz',
            'ղ': 'ʁ',
            'ճ': 'tʃ',
            'մ': 'm',
            'յ': 'j',
            'ն': 'n',
            'շ': 'ʃ',
            'ո': 'o',
            'չ': 'tʃh',
            'պ': 'p',
            'ջ': 'dʒ',
            'ռ': 'rr',
            'ս': 's',
            'վ': 'v',
            'տ': 't',
            'ր': 'ɾr',
            'ց': 'tsh',
            'ւ': 'v',
            'փ': 'ph',
            'ք': 'kh',
            'օ': 'o',
            'ֆ': 'f',
            'և': 'jev',
        }
        # Numbers 0-10 spelled out in Armenian
        self.numbers = {
            '0': 'զրո',
            '1': 'մեկ',
            '2': 'երկու',
            '3': 'երեք',
            '4': 'չորս',
            '5': 'հինգ',
            '6': 'վեց',
            '7': 'յոթ',
            '8': 'ութ',
            '9': 'ինը',
            '10': 'տասը',
        }
        # Whole-hour time strings spelled out in Armenian
        self.time_formats = {
            '00:00': 'կեսգիշեր',
            '12:00': 'կեսօր',
            '13:00': 'ժամը մեկ',
            '14:00': 'ժամը երկու',
            '15:00': 'ժամը երեք',
            '16:00': 'ժամը չորս',
            '17:00': 'ժամը հինգ',
            '18:00': 'ժամը վեց',
            '19:00': 'ժամը յոթ',
            '20:00': 'ժամը ութ',
            '21:00': 'ժամը ինը',
            '22:00': 'ժամը տասը',
            '23:00': 'ժամը տասնմեկ',
        }
        # Letter pairs substituted (in this order) before single letters
        self.digraphs = {
            'ու': 'u',
            'իւ': 'ju',
            'եա': 'ja',
            'եո': 'jo',
            'եւ': 'ev',
        }

    @staticmethod
    def _map_words(text, word_fn):
        """Apply ``word_fn`` to each whitespace-separated word, line by line.

        Words on a line are re-joined with single spaces (as before), but line
        breaks are kept so that callers splitting the result on newlines still
        see the original lines.  Leading/trailing whitespace is dropped.
        """
        return '\n'.join(
            ' '.join(word_fn(word) for word in line.split())
            for line in text.strip().splitlines()
        )

    def _expand_word(self, word):
        """Return the Armenian spelling of a known time/number token, else ``word``."""
        if word in self.time_formats:
            return self.time_formats[word]
        if word in self.numbers:
            return self.numbers[word]
        return word

    def _markup_word(self, word):
        """Wrap ``word`` in ``[word](/ipa/)`` if it contains any Armenian letter."""
        # Lower-case before the lookup so ALL-CAPS words are recognised too;
        # convert_word lower-cases its input in the same way.
        if any(c in self.armenian_to_ipa for c in word.lower()):
            return f'[{word}](/{self.convert_word(word)}/)'
        return word

    def preprocess_text(self, text):
        """Convert numbers and time formats to Armenian text before IPA conversion.

        Line breaks in ``text`` are preserved; other whitespace runs collapse
        to a single space.
        """
        return self._map_words(text, self._expand_word)

    def convert_word(self, armenian_word):
        """Convert a single Armenian word to IPA.

        The word is lower-cased and digraphs are substituted first.  Three
        letters are position-dependent: 'ե' -> 'jɛ' and 'ո' -> 'vo' when they
        are the first letter of the word, and 'կ' drops its schwa when it is
        the last letter.  "First" and "last" ignore attached punctuation, so
        a trailing comma or a leading quote does not defeat these rules.
        Characters without a mapping are copied through unchanged.
        """
        if not armenian_word:
            return ''
        word = armenian_word.lower()
        # Handle digraphs first
        for digraph, ipa in self.digraphs.items():
            word = word.replace(digraph, ipa)

        # Indices of the first and last alphabetic characters; -1 if none.
        letter_positions = [i for i, char in enumerate(word) if char.isalpha()]
        first_letter = letter_positions[0] if letter_positions else -1
        last_letter = letter_positions[-1] if letter_positions else -1

        pieces = []
        for i, char in enumerate(word):
            if char == 'ե':
                pieces.append('jɛ' if i == first_letter else 'ɛ')
            elif char == 'կ':
                pieces.append('k' if i == last_letter else 'kə')
            elif char == 'ո':
                pieces.append('vo' if i == first_letter else 'o')
            else:
                pieces.append(self.armenian_to_ipa.get(char, char))
        return ''.join(pieces)

    def convert_text(self, text):
        """Convert Armenian text to IPA with formatting.

        Returns the text with every Armenian word replaced by ``[word](/ipa/)``.
        Line breaks are preserved so the result can still be split per line.
        """
        preprocessed_text = self.preprocess_text(text)
        return self._map_words(preprocessed_text, self._markup_word)
# # Example usage
# if __name__ == "__main__":
# converter = ArmenianToIPA()
# # Test examples
# test_text = "Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:"
# result = converter.convert_text(test_text)
# print(result)
-------------------------------------------------------------------------------------------------------------------
inference.py
from kokoro import KPipeline
import soundfile as sf
import numpy as np
from greek import GreekToIPA
from armenian import ArmenianToIPA
# Demo script: convert mixed Greek/Armenian text to phoneme markup, synthesise
# it with Kokoro, and write all segments to a single WAV file.
pipeline = KPipeline(lang_code='a')
text = """
Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά.
Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:
"""
gr_converter = GreekToIPA()
arm_converter = ArmenianToIPA()


def _convert_lines(converter, raw_text):
    """Run ``converter.convert_text`` on each line and keep the line breaks.

    Converting line by line guarantees the newlines survive, so the
    ``split_pattern=r'\\n+'`` passed to the pipeline below still splits the
    input into one segment per line.
    """
    return '\n'.join(
        converter.convert_text(line) for line in raw_text.strip().splitlines()
    )


# Test example
greek_check = True
if greek_check:
    text = _convert_lines(gr_converter, text)
    print(text)
# Test example
armenian_check = True
if armenian_check:
    text = _convert_lines(arm_converter, text)
    print(text)

generator = pipeline(
    text, voice='em_santa',
    speed=1.0, split_pattern=r'\n+'
)
# Collect all audio segments
all_audio = [audio for _, _, audio in generator]
if not all_audio:
    # np.concatenate([]) would fail with an unhelpful error message.
    raise SystemExit("No audio was generated for the given text.")
# Concatenate all audio segments
combined_audio = np.concatenate(all_audio)
# Save the combined audio (24000 Hz sample rate)
sf.write('combined_output.wav', combined_audio, 24000)
###-------------------------------------------------------------------------------------------------------------------
simple_gui.py
import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import wave
import pyaudio
import tempfile
import threading
import queue
from armenian import ArmenianToIPA
from greek import GreekToIPA
# Module-level converter instances; TTSApp.generate_audio runs the input text
# through the Greek converter first and the Armenian converter second.
gr_converter = GreekToIPA()
hy_converter = ArmenianToIPA()
class AudioPlayer:
    """Play a WAV file through PyAudio on a background thread.

    State flags:
      * ``is_playing`` - a playback is in progress (stays True while paused).
      * ``is_paused``  - the current playback is temporarily halted.
    """

    def __init__(self):
        self.pyaudio = pyaudio.PyAudio()
        self.stream = None
        self.is_playing = False
        self.is_paused = False
        self.audio_thread = None
        self.current_file = None

    def play_file(self, filename, chunk_size=1024):
        """Start playing ``filename``, or resume it if it is currently paused.

        Args:
            filename: Path of the WAV file to play.
            chunk_size: Number of frames read and written per iteration.
        """
        # Local import so this class does not depend on a file-level import.
        import time

        worker_alive = (
            self.audio_thread is not None and self.audio_thread.is_alive()
        )
        # Resume only if there really is a paused worker for this file;
        # otherwise a stale pause flag would "resume" nothing and stay silent.
        if self.is_paused and self.current_file == filename and worker_alive:
            self.is_paused = False
            self.is_playing = True
            return
        if self.is_playing:
            self.stop()
        self.current_file = filename
        self.is_playing = True
        self.is_paused = False

        def stream_audio():
            try:
                with wave.open(filename, 'rb') as wf:
                    stream = self.pyaudio.open(
                        format=self.pyaudio.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True
                    )
                    self.stream = stream
                    try:
                        data = wf.readframes(chunk_size)
                        while data and self.is_playing:
                            if self.is_paused:
                                # Sleep instead of spinning a CPU core.
                                time.sleep(0.05)
                                continue
                            stream.write(data)
                            data = wf.readframes(chunk_size)
                    finally:
                        stream.stop_stream()
                        stream.close()
            finally:
                # Always reset the flags, even if opening or writing failed.
                self.is_playing = False
                self.is_paused = False

        self.audio_thread = threading.Thread(target=stream_audio)
        self.audio_thread.daemon = True
        self.audio_thread.start()

    def pause(self):
        """Pause the current playback; does nothing when nothing is playing."""
        if self.is_playing:
            self.is_paused = True

    def stop(self):
        """Stop playback and wait up to one second for the worker to exit."""
        self.is_playing = False
        self.is_paused = False
        thread = getattr(self, 'audio_thread', None)
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=1)

    def __del__(self):
        self.stop()
        # __init__ may have failed before the PyAudio handle was created.
        backend = getattr(self, 'pyaudio', None)
        if backend is not None:
            backend.terminate()
class TTSApp:
    """Tkinter front end: enter text, generate speech, then play or save it.

    Generation runs on a background thread. Widget values are read on the Tk
    thread before the worker starts, and results are handed back through
    ``root.after``.
    """

    def __init__(self, root):
        self.root = root
        self.root.title("TTS Generator")
        self.audio_player = AudioPlayer()
        self.voices = [
            'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica',
            'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
            'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
            'am_onyx', 'am_puck', 'am_santa', 'bf_alice', 'bf_emma', 'bf_isabella',
            'bf_lily', 'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis', 'ef_dora',
            'em_alex', 'em_santa', 'ff_siwis', 'hf_alpha', 'hf_beta', 'hm_omega',
            'hm_psi', 'if_sara', 'im_nicola', 'jf_alpha', 'jf_gongitsune',
            'jf_nezumi', 'jf_tebukuro', 'jm_kumo', 'pf_dora', 'pm_alex',
            'pm_santa', 'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
            'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'
        ]
        self.lang_codes = ['a', 'b', 'j', 'z']
        self.lang_code = tk.StringVar(value='a')
        self.voice = tk.StringVar(value='em_santa')
        self.speed = tk.DoubleVar(value=1.0)
        self.speed_label = tk.StringVar(value="Speed: 1.0")
        self.current_audio = None
        # Closed NamedTemporaryFile holding the last generated audio (or None).
        self.temp_file = None
        self.is_processing = False
        # Id of the pending playback-poll callback, or None when not polling.
        self._poll_job = None
        self.create_widgets()
        self.create_loading_indicator()

    def update_speed_label(self, *args):
        """Refresh the "Speed: x.y" label from the current slider value."""
        self.speed_label.set(f"Speed: {self.speed.get():.1f}")

    def create_widgets(self):
        """Build the text box, the parameter controls and the button row."""
        main_frame = ttk.Frame(self.root)
        main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        # Text input with label
        ttk.Label(main_frame, text="Input Text:").pack(anchor='w')
        self.text_input = scrolledtext.ScrolledText(main_frame, height=10)
        self.text_input.pack(fill=tk.BOTH, expand=True, pady=(0, 10))
        # Parameters frame
        params_frame = ttk.LabelFrame(main_frame, text="Parameters")
        params_frame.pack(fill=tk.X, pady=5)
        # Grid layout for parameters
        for i, (label, var, values) in enumerate([
            ("Language Code:", self.lang_code, self.lang_codes),
            ("Voice:", self.voice, self.voices)
        ]):
            ttk.Label(params_frame, text=label).grid(row=i, column=0, padx=5, pady=2)
            combo = ttk.Combobox(params_frame, textvariable=var, values=values, state='readonly')
            combo.grid(row=i, column=1, padx=5, pady=2, sticky='ew')
        # Speed control
        ttk.Label(params_frame, textvariable=self.speed_label).grid(row=2, column=0, padx=5, pady=2)
        speed_scale = ttk.Scale(params_frame, from_=0.5, to=2.0, variable=self.speed,
                                orient=tk.HORIZONTAL, command=lambda x: self.update_speed_label())
        speed_scale.grid(row=2, column=1, padx=5, pady=2, sticky='ew')
        # Configure grid columns
        params_frame.grid_columnconfigure(1, weight=1)
        # Buttons frame
        buttons_frame = ttk.Frame(main_frame)
        buttons_frame.pack(pady=10)
        # Generate button
        self.generate_btn = ttk.Button(buttons_frame, text="Generate",
                                       command=self.start_generation)
        self.generate_btn.pack(side=tk.LEFT, padx=5)
        # Audio control buttons
        self.play_btn = ttk.Button(buttons_frame, text="Play",
                                   command=self.play_audio, state='disabled')
        self.play_btn.pack(side=tk.LEFT, padx=5)
        self.pause_btn = ttk.Button(buttons_frame, text="Pause",
                                    command=self.pause_audio, state='disabled')
        self.pause_btn.pack(side=tk.LEFT, padx=5)
        self.stop_btn = ttk.Button(buttons_frame, text="Stop",
                                   command=self.stop_audio, state='disabled')
        self.stop_btn.pack(side=tk.LEFT, padx=5)
        self.save_btn = ttk.Button(buttons_frame, text="Save",
                                   command=self.save_audio, state='disabled')
        self.save_btn.pack(side=tk.LEFT, padx=5)

    def create_loading_indicator(self):
        """Create one label per spinner frame; they are shown one at a time."""
        self.loading_frames = []
        chars = ["|", "/", "-", "\\"]
        for char in chars:
            label = ttk.Label(self.root, text=char, font=('Courier', 24))
            self.loading_frames.append(label)
        self.current_frame = 0

    def animate_loading(self):
        """Advance the spinner every 100 ms while ``is_processing`` is set."""
        if self.is_processing:
            self.loading_frames[self.current_frame].place_forget()
            self.current_frame = (self.current_frame + 1) % len(self.loading_frames)
            self.loading_frames[self.current_frame].place(relx=0.5, rely=0.5, anchor='center')
            self.root.after(100, self.animate_loading)
        else:
            self.loading_frames[self.current_frame].place_forget()

    def start_generation(self):
        """Disable the controls and start generation on a worker thread."""
        if self.is_processing:
            # Ignore re-entrant calls while a generation is still running.
            return
        self.is_processing = True
        # Stop playback so the old temp file is no longer in use.
        self.audio_player.stop()
        for button in (self.generate_btn, self.play_btn, self.pause_btn,
                       self.stop_btn, self.save_btn):
            button.configure(state='disabled')
        self.animate_loading()
        # Read every widget value here, on the Tk thread; the worker thread
        # must not touch Tk widgets or variables.
        request = {
            'text': self.text_input.get("1.0", tk.END).strip(),
            'lang_code': self.lang_code.get(),
            'voice': self.voice.get(),
            'speed': self.speed.get(),
        }
        thread = threading.Thread(target=self.generate_audio, kwargs=request)
        thread.daemon = True
        thread.start()

    def generate_audio(self, text=None, lang_code=None, voice=None, speed=None):
        """Synthesise ``text`` and store the result for playback and saving.

        Any argument left as ``None`` is read from the corresponding widget,
        which keeps the original zero-argument call working. On success
        ``current_audio`` and ``temp_file`` are replaced; on failure the
        previous audio is kept. ``generation_complete`` is always scheduled
        on the Tk thread afterwards.
        """
        if text is None:
            text = self.text_input.get("1.0", tk.END).strip()
        if lang_code is None:
            lang_code = self.lang_code.get()
        if voice is None:
            voice = self.voice.get()
        if speed is None:
            speed = self.speed.get()

        error_message = None
        if text:
            new_file = None
            try:
                pipeline = KPipeline(lang_code=lang_code)
                text = gr_converter.convert_text(text)
                text = hy_converter.convert_text(text)
                print(text)
                generator = pipeline(
                    text, voice=voice,
                    speed=speed, split_pattern=r'\n+'
                )
                all_audio = [audio for _, _, audio in generator]
                if not all_audio:
                    raise ValueError("No audio was generated for the given text")
                audio = np.concatenate(all_audio)
                new_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
                # Close our handle first: an open handle can block the write
                # (and later playback) on some platforms.
                new_file.close()
                sf.write(new_file.name, audio, 24000)
            except Exception as e:
                # Thread boundary: report the failure instead of dying silently.
                print("Error", str(e))
                # Copy the message now; ``e`` is unbound once this block ends.
                error_message = str(e)
                self._remove_temp_file(new_file)
            else:
                previous_file = self.temp_file
                self.current_audio = audio
                self.temp_file = new_file
                # The previous temp file is no longer needed.
                self._remove_temp_file(previous_file)
        self.root.after(0, lambda: self.generation_complete(error_message))

    @staticmethod
    def _remove_temp_file(temp_file):
        """Best-effort removal of a temporary file created by generate_audio."""
        import os  # local import: this class must not require a new file-level import

        if temp_file is None:
            return
        try:
            os.remove(temp_file.name)
        except OSError:
            # File already gone or still in use; nothing useful to do here.
            pass

    def generation_complete(self, error_message=None):
        """Re-enable the controls after a generation run (Tk thread only).

        Args:
            error_message: Text of the failure to show, or ``None`` on success.
        """
        self.is_processing = False
        self.generate_btn.configure(state='normal')
        has_audio = self.current_audio is not None and self.temp_file is not None
        # Play/Save only make sense when there is audio to play or save.
        self.play_btn.configure(state='normal' if has_audio else 'disabled')
        self.save_btn.configure(state='normal' if has_audio else 'disabled')
        # Nothing is playing right after generation.
        self.pause_btn.configure(state='disabled')
        self.stop_btn.configure(state='disabled')
        if error_message is not None:
            messagebox.showerror("Error", error_message)

    def play_audio(self):
        """Play (or resume) the last generated audio."""
        if self.current_audio is None or self.temp_file is None:
            return
        self.audio_player.play_file(self.temp_file.name)
        self.play_btn.configure(state='disabled')
        self.pause_btn.configure(state='normal')
        self.stop_btn.configure(state='normal')
        if self._poll_job is None:
            self._poll_job = self.root.after(200, self._poll_playback)

    def _poll_playback(self):
        """Restore the buttons once playback has ended by itself."""
        self._poll_job = None
        if self.is_processing:
            # A generation run owns the button states right now.
            return
        if self.audio_player.is_playing:
            self._poll_job = self.root.after(200, self._poll_playback)
            return
        self.play_btn.configure(state='normal')
        self.pause_btn.configure(state='disabled')
        self.stop_btn.configure(state='disabled')

    def pause_audio(self):
        """Pause playback and let the user resume with Play."""
        self.audio_player.pause()
        self.play_btn.configure(state='normal')
        self.pause_btn.configure(state='disabled')

    def stop_audio(self):
        """Stop playback and reset the playback buttons."""
        self.audio_player.stop()
        self.play_btn.configure(state='normal')
        self.pause_btn.configure(state='disabled')
        self.stop_btn.configure(state='disabled')

    def save_audio(self):
        """Write the last generated audio to 'output_audio.wav'."""
        if self.current_audio is not None:
            sf.write('output_audio.wav', self.current_audio, 24000)
            messagebox.showinfo("Success", "Audio saved as 'output_audio.wav'")

    def __del__(self):
        if hasattr(self, 'audio_player'):
            self.audio_player.stop()
        # Do not leave the last temp file behind.
        self._remove_temp_file(getattr(self, 'temp_file', None))
if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the TTS app to it,
    # and hand control to the Tk event loop until the window is closed.
    root = tk.Tk()
    app = TTSApp(root)
    root.mainloop()