Text-to-Speech
English

Kokoro can also speak untrained languages?! (Examples: Greek and Armenian)

#105
by vahanAI - opened

In the model card, I noticed that the user can provide Kokoro with the exact IPA text for audio export. After some trial and error, I created two simple draft Python files that convert Greek and Armenian text into tokens that Kokoro can understand and turn into speech. Of course, the results are not perfect, because the speech inherits the tone of the original speaker (the em_santa voice gives better results, hoho MC XD). If there is a better implementation that produces a more natural tone, that would be awesome. @hexgrad, do you have any suggestions?
Please note that the code can seamlessly handle Greek and Armenian text together in the same input.
I also provide a simple GUI (created with an LLM) for the community to play with, hoping for some improvements.
Finally, I provide an audio example.

Put the following files in the same directory:
greek.py
armenian.py
inference.py
simple_gui.py

and have fun :)

-------------------------------------------------------------------------------------------------------------------

greek.py

class GreekToIPA:
    """Rule-based converter from Greek text to ``[word](/phonemes/)`` markup.

    Each word that contains at least one Greek letter is replaced by
    ``[original](/phonemes/)``; every other token is passed through untouched.
    The digits 0-10 and a handful of full-hour clock times are spelled out in
    Greek before conversion.

    The phoneme strings are a hand-tuned approximation (some entries such as
    ``gh`` and ``th`` are deliberately not strict IPA), so the tables are kept
    as plain instance attributes that callers may adjust.
    """

    def __init__(self):
        # Single-letter fallback table, applied after the digraph pass.
        # The stress mark (ˈ) is always written *before* the stressed vowel.
        self.greek_to_ipa = {
            'α': 'a',
            'ά': 'ˈa',
            'β': 'v',
            'γ': 'gh',
            'δ': 'ð',
            'ε': 'ɛ',
            'έ': 'ˈɛ',
            'ζ': 'z',
            'η': 'i',
            'ή': 'ˈi',
            'θ': 'th',
            'ι': 'i',
            'ί': 'ˈi',
            'κ': 'k',
            'λ': 'l',
            'μ': 'm',
            'ν': 'n',
            'ξ': 'ks',
            'ο': 'o',
            'ό': 'ˈo',
            'π': 'p',
            'ρ': 'r',
            'σ': 's',
            'ς': 's',
            'τ': 't',
            'υ': 'i',
            'ύ': 'ˈi',
            'φ': 'f',
            'χ': 'x',
            'ψ': 'ps',
            'ω': 'o',
            'ώ': 'ˈo',
            'ϊ': 'i',
            'ΐ': 'ˈi',
            'ϋ': 'i',
            'ΰ': 'ˈi'
        }

        # Two-letter combinations, replaced before the single-letter pass.
        self.digraphs = {
            'αι': 'ɛ',
            'άι': 'ˈai',
            # Stress mark goes before the vowel, like every other entry
            # (this one used to be written as 'ɛˈ').
            'αί': 'ˈɛ',
            'ει': 'i',
            'εί': 'ˈi',
            'οι': 'i',
            'οί': 'ˈi',
            'ού': 'ˈu',
            'ου': 'u',
            'μπ': 'b',
            'ντ': 'd',
            'γκ': 'ghk',
            'γγ': 'ŋg',
            'αυ': 'av',
            'άυ': 'ˈav',
            # Stressed counterpart of 'αυ', mirroring 'εύ' below; without it
            # "αύριο" fell through to the single letters and became "aˈi...".
            'αύ': 'ˈav',
            'ευ': 'ev',
            'έυ': 'ˈev',
            'εύ': 'ˈev'
        }

        # Spelled-out forms for the stand-alone numbers 0-10.
        self.numbers = {
            '0': 'μηδέν',
            '1': 'ένα',
            '2': 'δύο',
            '3': 'τρία',
            '4': 'τέσσερα',
            '5': 'πέντε',
            '6': 'έξι',
            '7': 'επτά',
            '8': 'οκτώ',
            '9': 'εννέα',
            '10': 'δέκα'
        }

        # Spelled-out forms for the supported HH:MM tokens (full hours only).
        self.time_formats = {
            '00:00': 'μεσάνυχτα',
            '12:00': 'μεσημέρι',
            '13:00': 'μία',
            '14:00': 'δύο',
            '15:00': 'τρεις',
            '16:00': 'τέσσερις',
            '17:00': 'πέντε',
            '18:00': 'έξι',
            '19:00': 'επτά',
            '20:00': 'οκτώ',
            '21:00': 'εννέα',
            '22:00': 'δέκα',
            '23:00': 'έντεκα'
        }

    def preprocess_text(self, text):
        """Spell out known numbers and clock times in Greek.

        The text is split on whitespace. A token is replaced only when it is
        exactly a key of ``time_formats`` (five characters containing ``:``)
        or an all-digit key of ``numbers``; anything else, including a number
        with punctuation attached, is kept as is.

        Returns:
            The tokens joined with single spaces (all original whitespace,
            including line breaks, is collapsed).
        """
        replaced = []
        for word in text.split():
            if ':' in word and len(word) == 5 and word in self.time_formats:
                replaced.append(self.time_formats[word])
            elif word.isdigit() and word in self.numbers:
                replaced.append(self.numbers[word])
            else:
                replaced.append(word)
        return ' '.join(replaced)

    def convert_word(self, greek_word):
        """Return the phoneme string for a single word.

        The word is lower-cased, digraphs are substituted first, and every
        remaining character is looked up in ``greek_to_ipa``. Characters with
        no mapping (punctuation, Latin letters, phonemes already produced by
        the digraph pass) are copied through unchanged.

        Returns:
            The phoneme string, or ``''`` for an empty/None word.
        """
        if not greek_word:
            return ''

        word = greek_word.lower()
        for digraph, ipa in self.digraphs.items():
            word = word.replace(digraph, ipa)

        return ''.join(self.greek_to_ipa.get(char, char) for char in word)

    def _to_markup(self, word):
        """Wrap *word* in ``[word](/phonemes/)`` if it contains a Greek letter."""
        if any(char in self.greek_to_ipa for char in word.lower()):
            return f'[{word}](/{self.convert_word(word)}/)'
        return word

    def convert_text(self, text):
        """Convert Greek text to ``[word](/phonemes/)`` markup, line by line.

        Every line is preprocessed with :meth:`preprocess_text` and converted
        word by word. Line breaks between non-empty lines are preserved so
        that a caller splitting the result on newlines still sees the original
        segments; blank lines are dropped and whitespace inside a line is
        collapsed to single spaces. Single-line input gives the same result
        as before line handling was added.

        Returns:
            The converted text; ``''`` when the input has no tokens.
        """
        converted_lines = []
        for line in text.splitlines():
            words = self.preprocess_text(line).split()
            if words:
                converted_lines.append(
                    ' '.join(self._to_markup(word) for word in words)
                )
        return '\n'.join(converted_lines)

# # Example usage
# if __name__ == "__main__":
#     converter = GreekToIPA()
#     test_text = "Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά."
#     result = converter.convert_text(test_text)
#     print(result)

-------------------------------------------------------------------------------------------------------------------

armenian.py

class ArmenianToIPA:
    """Rule-based converter from Armenian text to ``[word](/phonemes/)`` markup.

    Each word that contains at least one Armenian letter is replaced by
    ``[original](/phonemes/)``; every other token is passed through untouched.
    The digits 0-10 and a handful of full-hour clock times are spelled out in
    Armenian before conversion.

    The phoneme strings are a hand-tuned approximation rather than strict
    IPA, so the tables are kept as plain instance attributes that callers may
    adjust.
    """

    def __init__(self):
        # Single-letter fallback table, applied after the digraph pass.
        # 'ե', 'կ' and 'ո' are position-dependent and are special-cased in
        # convert_word; their entries here still mark them as Armenian
        # letters for the detection in convert_text.
        self.armenian_to_ipa = {
            'ա': 'ɑ',
            'բ': 'b',
            'գ': 'ɡ',
            'դ': 'd',
            'ե': 'ɛ',
            'զ': 'z',
            'է': 'ɛ',
            'ը': 'ə',
            'թ': 'th',
            'ժ': 'ʒ',
            'ի': 'i',
            'լ': 'l',
            'խ': 'x',
            'ծ': 'ts',
            'կ': 'kə',
            'հ': 'h',
            'ձ': 'dz',
            'ղ': 'ʁ',
            'ճ': 'tʃ',
            'մ': 'm',
            'յ': 'j',
            'ն': 'n',
            'շ': 'ʃ',
            'ո': 'o',
            'չ': 'tʃh',
            'պ': 'p',
            'ջ': 'dʒ',
            'ռ': 'rr',
            'ս': 's',
            'վ': 'v',
            'տ': 't',
            'ր': 'ɾr',
            'ց': 'tsh',
            'ւ': 'v',
            'փ': 'ph',
            'ք': 'kh',
            'օ': 'o',
            'ֆ': 'f',
            'և': 'jev'
        }

        # Spelled-out forms for the stand-alone numbers 0-10.
        self.numbers = {
            '0': 'զրո',
            '1': 'մեկ',
            '2': 'երկու',
            '3': 'երեք',
            '4': 'չորս',
            '5': 'հինգ',
            '6': 'վեց',
            '7': 'յոթ',
            '8': 'ութ',
            '9': 'ինը',
            '10': 'տասը'
        }

        # Spelled-out forms for the supported HH:MM tokens (full hours only).
        self.time_formats = {
            '00:00': 'կեսգիշեր',
            '12:00': 'կեսօր',
            '13:00': 'ժամը մեկ',
            '14:00': 'ժամը երկու',
            '15:00': 'ժամը երեք',
            '16:00': 'ժամը չորս',
            '17:00': 'ժամը հինգ',
            '18:00': 'ժամը վեց',
            '19:00': 'ժամը յոթ',
            '20:00': 'ժամը ութ',
            '21:00': 'ժամը ինը',
            '22:00': 'ժամը տասը',
            '23:00': 'ժամը տասնմեկ'
        }

        # Two-letter combinations, replaced before the single-letter pass.
        self.digraphs = {
            'ու': 'u',
            'իւ': 'ju',
            'եա': 'ja',
            'եո': 'jo',
            'եւ': 'ev',
        }

    def preprocess_text(self, text):
        """Spell out known numbers and clock times in Armenian.

        The text is split on whitespace. A token is replaced only when it is
        exactly a key of ``time_formats`` (five characters containing ``:``)
        or an all-digit key of ``numbers``; anything else is kept as is.

        Returns:
            The tokens joined with single spaces (all original whitespace,
            including line breaks, is collapsed).
        """
        replaced = []
        for word in text.split():
            if ':' in word and len(word) == 5 and word in self.time_formats:
                replaced.append(self.time_formats[word])
            elif word.isdigit() and word in self.numbers:
                replaced.append(self.numbers[word])
            else:
                replaced.append(word)
        return ' '.join(replaced)

    def convert_word(self, armenian_word):
        """Return the phoneme string for a single word.

        The word is lower-cased and digraphs are substituted first. Three
        letters are position-dependent (positions are counted in the word
        *after* digraph substitution): 'ե' is ``jɛ`` word-initially and ``ɛ``
        elsewhere, 'ո' is ``vo`` word-initially and ``o`` elsewhere, and 'կ'
        is ``k`` as the last character and ``kə`` elsewhere. Characters with
        no mapping are copied through unchanged.

        Returns:
            The phoneme string, or ``''`` for an empty/None word.
        """
        if not armenian_word:
            return ''

        word = armenian_word.lower()

        # Handle digraphs first so their letters are not mapped one by one.
        for digraph, ipa in self.digraphs.items():
            word = word.replace(digraph, ipa)

        last_index = len(word) - 1
        phonemes = []
        for index, char in enumerate(word):
            if char == 'ե':
                phonemes.append('jɛ' if index == 0 else 'ɛ')
            elif char == 'կ':
                phonemes.append('k' if index == last_index else 'kə')
            elif char == 'ո':
                phonemes.append('vo' if index == 0 else 'o')
            else:
                phonemes.append(self.armenian_to_ipa.get(char, char))

        return ''.join(phonemes)

    def _to_markup(self, word):
        """Wrap *word* in ``[word](/phonemes/)`` if it contains an Armenian letter."""
        # Lower-case before the lookup: the table only holds lower-case
        # letters, so an all-caps word would otherwise never be recognised.
        if any(char in self.armenian_to_ipa for char in word.lower()):
            return f'[{word}](/{self.convert_word(word)}/)'
        return word

    def convert_text(self, text):
        """Convert Armenian text to ``[word](/phonemes/)`` markup, line by line.

        Every line is preprocessed with :meth:`preprocess_text` and converted
        word by word. Line breaks between non-empty lines are preserved so
        that a caller splitting the result on newlines still sees the original
        segments; blank lines are dropped and whitespace inside a line is
        collapsed to single spaces.

        Returns:
            The converted text; ``''`` when the input has no tokens.
        """
        converted_lines = []
        for line in text.splitlines():
            words = self.preprocess_text(line).split()
            if words:
                converted_lines.append(
                    ' '.join(self._to_markup(word) for word in words)
                )
        return '\n'.join(converted_lines)

# # Example usage
# if __name__ == "__main__":
#     converter = ArmenianToIPA()
    
#     # Test examples
#     test_text = "Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:"
#     result = converter.convert_text(test_text)
#     print(result)

-------------------------------------------------------------------------------------------------------------------

inference.py

from kokoro import KPipeline
import soundfile as sf
import numpy as np
from greek import GreekToIPA
from armenian import ArmenianToIPA

# Demo script: convert a mixed Greek/Armenian text to phoneme markup, run it
# through a Kokoro pipeline and write all generated audio to one WAV file.
pipeline = KPipeline(lang_code='a')

text = """
Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά.
Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:
"""

gr_converter = GreekToIPA()
arm_converter = ArmenianToIPA()

# Toggles for the two conversion passes; Greek runs first, then Armenian.
greek_check = True
armenian_check = True

for enabled, converter in (
    (greek_check, gr_converter),
    (armenian_check, arm_converter),
):
    if enabled:
        text = converter.convert_text(text)
        # Show the intermediate markup after each pass.
        print(text)

generator = pipeline(
    text,
    voice='em_santa',
    speed=1.0,
    split_pattern=r'\n+',
)

# Each item yielded by the pipeline is a 3-tuple; only the audio part is kept.
all_audio = [audio for _, _, audio in generator]

# Join the segments end to end and save them as a single file.
combined_audio = np.concatenate(all_audio)
sf.write('combined_output.wav', combined_audio, 24000)

###-------------------------------------------------------------------------------------------------------------------

simple_gui.py

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import wave
import pyaudio
import tempfile
import threading
import queue
from armenian import ArmenianToIPA
from greek import GreekToIPA

# Module-level converter instances, built once at import time and reused by
# TTSApp for every generation request.
gr_converter = GreekToIPA()
hy_converter = ArmenianToIPA()

class AudioPlayer:
    """Play WAV files through PyAudio on a background daemon thread.

    Playback state is exposed through the plain attributes ``is_playing``,
    ``is_paused`` and ``current_file``; the worker thread polls the two flags
    between chunks, so pause/stop take effect at chunk granularity.
    """

    # Seconds the worker sleeps between flag checks while paused, so that a
    # paused player does not spin a CPU core.
    _PAUSE_POLL_SECONDS = 0.05

    def __init__(self):
        self.pyaudio = pyaudio.PyAudio()
        self.stream = None
        self.is_playing = False
        self.is_paused = False
        self.audio_thread = None
        self.current_file = None

    def _can_resume(self, filename):
        """Return True if *filename* is paused mid-playback on a live thread.

        A pause flag with no running worker (for example when ``pause()`` was
        called after playback had already finished) is stale: un-pausing would
        play nothing, so the caller must start a fresh playback instead.
        """
        thread = self.audio_thread
        return bool(
            self.is_paused
            and self.current_file == filename
            and thread is not None
            and thread.is_alive()
        )

    def play_file(self, filename, chunk_size=1024):
        """Start playing *filename*, or resume it if it is currently paused.

        Args:
            filename: Path of a WAV file readable by the ``wave`` module.
            chunk_size: Number of frames read and written per iteration.

        Any playback already in progress is stopped first. The call returns
        immediately; audio is streamed from a daemon thread.
        """
        # Function-scope import: the module's import block does not provide
        # ``time`` and the worker below needs it for the pause wait.
        import time

        if self._can_resume(filename):
            self.is_paused = False
            self.is_playing = True
            return

        if self.is_playing:
            self.stop()

        self.current_file = filename
        self.is_playing = True
        self.is_paused = False

        def stream_audio():
            stream = None
            try:
                with wave.open(filename, 'rb') as wf:
                    stream = self.pyaudio.open(
                        format=self.pyaudio.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True
                    )
                    self.stream = stream

                    data = wf.readframes(chunk_size)
                    while data and self.is_playing:
                        if self.is_paused:
                            # Wait instead of busy-looping until resumed
                            # or stopped.
                            time.sleep(self._PAUSE_POLL_SECONDS)
                            continue
                        stream.write(data)
                        data = wf.readframes(chunk_size)
            finally:
                # Always release the output stream and reset the flags, even
                # if opening or writing failed part-way through.
                if stream is not None:
                    stream.stop_stream()
                    stream.close()
                self.is_playing = False
                self.is_paused = False

        self.audio_thread = threading.Thread(target=stream_audio)
        self.audio_thread.daemon = True
        self.audio_thread.start()

    def pause(self):
        """Request that playback pauses after the current chunk."""
        self.is_paused = True

    def stop(self):
        """Request that playback stops and wait up to one second for it."""
        self.is_playing = False
        if self.audio_thread:
            self.audio_thread.join(timeout=1)

    def __del__(self):
        engine = getattr(self, 'pyaudio', None)
        if engine is None:
            # __init__ never got as far as creating the PyAudio instance,
            # so there is nothing to stop or terminate.
            return
        self.stop()
        engine.terminate()

class TTSApp:
    """Tkinter front end for generating, playing and saving Kokoro speech.

    The input text is run through the Greek and Armenian converters, then
    synthesised on a background thread so the window stays responsive. The
    result is kept in ``current_audio`` and in a temporary WAV file
    (``temp_file``) that the audio player reads from.
    """

    # Sample rate passed to soundfile for every WAV file this app writes.
    SAMPLE_RATE = 24000

    def __init__(self, root):
        self.root = root
        self.root.title("TTS Generator")
        self.audio_player = AudioPlayer()

        self.voices = [
            'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica',
            'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
            'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
            'am_onyx', 'am_puck', 'am_santa', 'bf_alice', 'bf_emma', 'bf_isabella',
            'bf_lily', 'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis', 'ef_dora',
            'em_alex', 'em_santa', 'ff_siwis', 'hf_alpha', 'hf_beta', 'hm_omega',
            'hm_psi', 'if_sara', 'im_nicola', 'jf_alpha', 'jf_gongitsune',
            'jf_nezumi', 'jf_tebukuro', 'jm_kumo', 'pf_dora', 'pm_alex',
            'pm_santa', 'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
            'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'
        ]
        self.lang_codes = ['a', 'b', 'j', 'z']

        self.lang_code = tk.StringVar(value='a')
        self.voice = tk.StringVar(value='em_santa')
        self.speed = tk.DoubleVar(value=1.0)
        self.speed_label = tk.StringVar(value="Speed: 1.0")
        self.current_audio = None
        # Closed NamedTemporaryFile whose .name is the WAV file holding
        # current_audio; None until the first successful generation.
        self.temp_file = None
        self.is_processing = False

        self.create_widgets()
        self.create_loading_indicator()

    def update_speed_label(self, *args):
        """Refresh the "Speed: x.y" label from the speed variable."""
        self.speed_label.set(f"Speed: {self.speed.get():.1f}")

    def create_widgets(self):
        """Build the text box, parameter controls and action buttons."""
        main_frame = ttk.Frame(self.root)
        main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)

        # Text input with label
        ttk.Label(main_frame, text="Input Text:").pack(anchor='w')
        self.text_input = scrolledtext.ScrolledText(main_frame, height=10)
        self.text_input.pack(fill=tk.BOTH, expand=True, pady=(0, 10))

        # Parameters frame
        params_frame = ttk.LabelFrame(main_frame, text="Parameters")
        params_frame.pack(fill=tk.X, pady=5)

        # One label + read-only combobox row per selectable parameter
        for i, (label, var, values) in enumerate([
            ("Language Code:", self.lang_code, self.lang_codes),
            ("Voice:", self.voice, self.voices)
        ]):
            ttk.Label(params_frame, text=label).grid(row=i, column=0, padx=5, pady=2)
            combo = ttk.Combobox(params_frame, textvariable=var, values=values, state='readonly')
            combo.grid(row=i, column=1, padx=5, pady=2, sticky='ew')

        # Speed control
        ttk.Label(params_frame, textvariable=self.speed_label).grid(row=2, column=0, padx=5, pady=2)
        speed_scale = ttk.Scale(params_frame, from_=0.5, to=2.0, variable=self.speed,
                              orient=tk.HORIZONTAL, command=lambda x: self.update_speed_label())
        speed_scale.grid(row=2, column=1, padx=5, pady=2, sticky='ew')

        # Let the control column absorb extra horizontal space
        params_frame.grid_columnconfigure(1, weight=1)

        # Buttons frame
        buttons_frame = ttk.Frame(main_frame)
        buttons_frame.pack(pady=10)

        # Generate button
        self.generate_btn = ttk.Button(buttons_frame, text="Generate",
                                     command=self.start_generation)
        self.generate_btn.pack(side=tk.LEFT, padx=5)

        # Audio control buttons start disabled: there is no audio yet
        self.play_btn = ttk.Button(buttons_frame, text="Play",
                                 command=self.play_audio, state='disabled')
        self.play_btn.pack(side=tk.LEFT, padx=5)

        self.pause_btn = ttk.Button(buttons_frame, text="Pause",
                                  command=self.pause_audio, state='disabled')
        self.pause_btn.pack(side=tk.LEFT, padx=5)

        self.stop_btn = ttk.Button(buttons_frame, text="Stop",
                                 command=self.stop_audio, state='disabled')
        self.stop_btn.pack(side=tk.LEFT, padx=5)

        self.save_btn = ttk.Button(buttons_frame, text="Save",
                                 command=self.save_audio, state='disabled')
        self.save_btn.pack(side=tk.LEFT, padx=5)

    def create_loading_indicator(self):
        """Create one label per spinner character; none is shown yet."""
        self.loading_frames = []
        chars = ["|", "/", "-", "\\"]
        for char in chars:
            label = ttk.Label(self.root, text=char, font=('Courier', 24))
            self.loading_frames.append(label)
        self.current_frame = 0

    def animate_loading(self):
        """Advance the spinner every 100 ms while a generation is running."""
        if self.is_processing:
            self.loading_frames[self.current_frame].place_forget()
            self.current_frame = (self.current_frame + 1) % len(self.loading_frames)
            self.loading_frames[self.current_frame].place(relx=0.5, rely=0.5, anchor='center')
            self.root.after(100, self.animate_loading)
        else:
            self.loading_frames[self.current_frame].place_forget()

    def start_generation(self):
        """Disable the buttons and start synthesis on a background thread."""
        # Read every widget value here, on the Tk main thread; the worker
        # thread must not touch Tk variables or widgets.
        request = (
            self.text_input.get("1.0", tk.END).strip(),
            self.lang_code.get(),
            self.voice.get(),
            self.speed.get(),
        )

        self.is_processing = True
        for button in (self.generate_btn, self.play_btn, self.pause_btn,
                       self.stop_btn, self.save_btn):
            button.configure(state='disabled')
        self.animate_loading()

        thread = threading.Thread(target=self._run_generation, args=request)
        thread.daemon = True
        thread.start()

    def generate_audio(self):
        """Synthesise the current input using the current widget values.

        Kept as the original entry point. It reads the widgets itself, so it
        should be called from the Tk main thread; ``start_generation`` takes
        its own snapshot and runs the work in the background instead.
        """
        self._run_generation(
            self.text_input.get("1.0", tk.END).strip(),
            self.lang_code.get(),
            self.voice.get(),
            self.speed.get(),
        )

    def _run_generation(self, text, lang_code, voice, speed):
        """Synthesise *text*, report failures, then hand back to the UI."""
        if text:
            try:
                self._synthesize(text, lang_code, voice, speed)
            except Exception as e:
                print("Error", str(e))
                # Bind the message now: the name ``e`` is cleared when the
                # except block ends, before the scheduled callback runs.
                message = str(e)
                self.root.after(0, lambda: messagebox.showerror("Error", message))

        self.is_processing = False
        self.root.after(0, self.generation_complete)

    def _synthesize(self, text, lang_code, voice, speed):
        """Generate audio for *text* and store it with a matching temp WAV."""
        pipeline = KPipeline(lang_code=lang_code)
        text = gr_converter.convert_text(text)
        text = hy_converter.convert_text(text)
        print(text)
        generator = pipeline(
            text, voice=voice,
            speed=speed, split_pattern=r'\n+'
        )
        audio = np.concatenate([segment for _, _, segment in generator])

        new_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        # Only the path is needed; closing the handle avoids leaking it and
        # lets other code open the file by name on every platform.
        new_file.close()
        try:
            sf.write(new_file.name, audio, self.SAMPLE_RATE)
        except Exception:
            self._discard_temp_file(new_file)
            raise

        # Commit audio and file together only after the write succeeded, so
        # current_audio and temp_file always describe the same result.
        previous = self.temp_file
        self.current_audio = audio
        self.temp_file = new_file
        self._discard_temp_file(previous)

    @staticmethod
    def _discard_temp_file(temp_file):
        """Delete a temp file created by this app, ignoring failures."""
        import os  # function-scope: not part of the module's import block

        if temp_file is None:
            return
        try:
            os.remove(temp_file.name)
        except OSError:
            # Best effort: the file may still be open for playback or may
            # already be gone.
            pass

    def generation_complete(self):
        """Re-enable the buttons after a generation attempt."""
        self.generate_btn.configure(state='normal')
        # The audio buttons are only useful once some audio exists.
        audio_state = 'normal' if self.current_audio is not None else 'disabled'
        for button in (self.play_btn, self.pause_btn, self.stop_btn, self.save_btn):
            button.configure(state=audio_state)

    def play_audio(self):
        """Play (or resume) the most recently generated audio."""
        if self.current_audio is not None and self.temp_file is not None:
            self.audio_player.play_file(self.temp_file.name)
            self.play_btn.configure(state='disabled')
            self.pause_btn.configure(state='normal')
            self.stop_btn.configure(state='normal')

    def pause_audio(self):
        """Pause playback and re-enable the Play button."""
        self.audio_player.pause()
        self.play_btn.configure(state='normal')
        self.pause_btn.configure(state='disabled')

    def stop_audio(self):
        """Stop playback and reset the playback buttons."""
        self.audio_player.stop()
        self.play_btn.configure(state='normal')
        self.pause_btn.configure(state='disabled')
        self.stop_btn.configure(state='disabled')

    def save_audio(self):
        """Write the current audio to 'output_audio.wav' in the working directory."""
        if self.current_audio is not None:
            sf.write('output_audio.wav', self.current_audio, self.SAMPLE_RATE)
            messagebox.showinfo("Success", "Audio saved as 'output_audio.wav'")

    def __del__(self):
        # audio_player is missing if __init__ failed before creating it.
        if hasattr(self, 'audio_player'):
            self.audio_player.stop()

if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the app to it and
    # run the Tk event loop until the window is closed.
    root = tk.Tk()
    app = TTSApp(root)
    root.mainloop()

Sign up or log in to comment