File size: 6,441 Bytes
9b3b65a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4af3d61
9b3b65a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import streamlit as st
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import wave
import numpy as np
import tempfile
import os

# Page configuration — must run before any other Streamlit call.
# Fix: the page_icon literal was mojibake (UTF-8 emoji bytes mis-decoded
# through a Thai codepage); restored to the intended microphone emoji.
st.set_page_config(
    page_title="Speech to Text Converter",
    page_icon="🎙️",
    layout="wide"
)

@st.cache_resource
def load_pipeline():
    """Build and cache the ASR pipeline (distil-whisper, CPU, float32).

    Cached with st.cache_resource so the model is downloaded and
    constructed only once per Streamlit server process.

    Returns:
        A transformers automatic-speech-recognition pipeline ready to
        transcribe raw audio dicts ({"raw": ..., "sampling_rate": ...}).
    """
    model_id = "distil-whisper/distil-large-v3"
    device = "cpu"
    dtype = torch.float32

    # Model weights: safetensors + low_cpu_mem_usage keeps peak RAM down
    # while loading on CPU.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    ).to(device)

    # Processor bundles the tokenizer and the feature extractor.
    processor = AutoProcessor.from_pretrained(model_id)

    # chunk_length_s=30 lets the pipeline handle audio longer than the
    # model's native window by chunking + stitching.
    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=8,
        torch_dtype=dtype,
        device=device,
    )

def read_wav_file(wav_file):
    """Read a WAV file and return mono float32 audio normalized to [-1, 1] at 16 kHz.

    Args:
        wav_file: Path (or file-like object) accepted by ``wave.open``.

    Returns:
        1-D ``numpy.ndarray`` of ``float32`` samples, mono, resampled to
        16 kHz (the rate the Whisper pipeline expects).

    Raises:
        ValueError: If the sample width is not 8-, 16-, or 32-bit PCM.
        wave.Error: If the file is not a valid WAV file.
    """
    with wave.open(wav_file, 'rb') as wav:
        channels = wav.getnchannels()
        sample_width = wav.getsampwidth()
        sample_rate = wav.getframerate()
        raw_data = wav.readframes(wav.getnframes())

        # Decode PCM samples to float32 in [-1, 1].
        if sample_width == 1:
            # 8-bit WAV is *unsigned*, centered at 128 — subtract the
            # midpoint before scaling (previously normalized to [0, 1],
            # leaving a large DC offset).
            samples = np.frombuffer(raw_data, dtype=np.uint8).astype(np.float32)
            audio_data = (samples - 128.0) / 128.0
        elif sample_width == 2:
            audio_data = (
                np.frombuffer(raw_data, dtype=np.int16).astype(np.float32)
                / np.iinfo(np.int16).max
            )
        elif sample_width == 4:
            # 32-bit integer PCM (previously rejected).
            audio_data = (
                np.frombuffer(raw_data, dtype=np.int32).astype(np.float32)
                / np.iinfo(np.int32).max
            )
        else:
            raise ValueError("Unsupported sample width")

        # Downmix any multi-channel audio to mono by averaging frames
        # (previously only exactly-stereo files were handled; 3+ channel
        # audio passed through still interleaved).
        if channels > 1:
            audio_data = audio_data.reshape(-1, channels).mean(axis=1)

        # Linear-interpolation resample to 16 kHz if necessary.
        if sample_rate != 16000:
            original_length = len(audio_data)
            desired_length = int(original_length * 16000 / sample_rate)
            indices = np.linspace(0, original_length - 1, desired_length)
            # np.interp returns float64; cast back so the dtype is
            # consistent regardless of whether resampling ran.
            audio_data = np.interp(
                indices, np.arange(original_length), audio_data
            ).astype(np.float32)

        return audio_data

def main():
    """Streamlit entry point: upload a WAV file and transcribe it.

    Flow: load (cached) pipeline -> file upload -> on-demand transcription
    with progress feedback -> show text, timestamps, and a download button.

    Fix: several UI string literals were mojibake (UTF-8 emoji bytes
    mis-decoded through a Thai codepage); restored the intended emojis.
    Logic is otherwise unchanged.
    """
    st.title("🎙️ Speech to Text Converter")
    st.markdown("### Upload a WAV file and convert speech to text")

    # Load pipeline (cached via st.cache_resource, so only slow once).
    with st.spinner("Loading model... This might take a few minutes the first time."):
        try:
            pipe = load_pipeline()
            st.success("Model loaded successfully! Ready to transcribe.")
        except Exception as e:
            st.error(f"Error loading model: {str(e)}")
            return

    # File upload
    audio_file = st.file_uploader(
        "Upload your audio file",
        type=['wav'],
        help="Only WAV files are supported. For better performance, keep files under 5 minutes."
    )

    if audio_file is not None:
        # Persist the upload to a temp file so wave.open can read it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            tmp_file.write(audio_file.getvalue())
            temp_path = tmp_file.name

        try:
            # Display audio player
            st.audio(audio_file)

            # Add transcribe button
            if st.button("🎯 Transcribe Audio", type="primary"):
                progress_bar = st.progress(0)
                status_text = st.empty()

                try:
                    # Read audio file
                    status_text.text("Reading audio file...")
                    progress_bar.progress(25)
                    audio_data = read_wav_file(temp_path)

                    # Transcribe
                    status_text.text("Transcribing... This might take a while.")
                    progress_bar.progress(50)

                    # read_wav_file always returns 16 kHz mono float audio,
                    # so the sampling_rate here is fixed.
                    result = pipe(
                        {"raw": audio_data, "sampling_rate": 16000},
                        return_timestamps=True
                    )

                    # Update progress
                    progress_bar.progress(100)
                    status_text.text("Transcription completed!")

                    # Display results
                    st.markdown("### Transcription Result:")
                    st.write(result["text"])

                    # Display timestamps if available
                    if "chunks" in result:
                        st.markdown("### Timestamps:")
                        for chunk in result["chunks"]:
                            st.write(f"{chunk['timestamp']}: {chunk['text']}")

                    # Download button
                    st.download_button(
                        label="📥 Download Transcription",
                        data=result["text"],
                        file_name="transcription.txt",
                        mime="text/plain"
                    )

                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")

        finally:
            # Clean up temporary file even if transcription failed.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    # Usage instructions
    with st.expander("ℹ️ Usage Instructions"):
        st.markdown("""
        ### Instructions:
        1. Upload a WAV file (16-bit PCM format recommended)
        2. Click 'Transcribe Audio'
        3. Wait for processing to complete
        4. View or download the transcription

        ### Notes:
        - Only WAV files are supported
        - Keep files under 5 minutes for best results
        - Audio should be clear with minimal background noise
        - The transcription includes timestamps for better reference
        """)

    # Footer
    st.markdown("---")
    st.markdown(
        "Made with ❤️ using Distil-Whisper model"
    )

# Script entry point (Streamlit executes this module top-to-bottom).
if __name__ == "__main__":
    main()