import streamlit as st
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import wave
import numpy as np
import tempfile
import os
# Page configuration
st.set_page_config(
    page_title="Speech to Text Converter",
    page_icon="🎙️",
    layout="wide"
)
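
# st.cache_resource keeps a single pipeline instance alive across Streamlit
# reruns, so the model weights are downloaded and loaded only once per process.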
@st.cache_resource
def load_pipeline():
    """Load the model, processor, and create pipeline"""
    device = "cpu"
    torch_dtype = torch.float32
    model_id = "distil-whisper/distil-large-v3"

    # Load model
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True
    )
    model.to(device)

    # Load processor
    processor = AutoProcessor.from_pretrained(model_id)

    # Create pipeline
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=8,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe
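
# Optional variant (an assumption, not part of the original app): on a
# CUDA-capable machine the same loader could pick the GPU and half precision,
# which speeds up inference considerably:
#
#   device = "cuda:0" if torch.cuda.is_available() else "cpu"
#   torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32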
def read_wav_file(wav_file):
    """Read a WAV file into a mono float32 array at 16 kHz using the wave library"""
    with wave.open(wav_file, 'rb') as wav:
        # Get WAV file parameters
        channels = wav.getnchannels()
        sample_width = wav.getsampwidth()
        sample_rate = wav.getframerate()
        n_frames = wav.getnframes()

        # Read raw audio data
        raw_data = wav.readframes(n_frames)

    # Convert bytes to a numpy array
    if sample_width == 1:
        dtype = np.uint8
    elif sample_width == 2:
        dtype = np.int16
    else:
        raise ValueError(f"Unsupported sample width: {sample_width} bytes (only 8-bit and 16-bit PCM is supported)")
    audio_data = np.frombuffer(raw_data, dtype=dtype).astype(np.float32)

    # Normalize to [-1, 1]; 8-bit WAV is unsigned with silence at 128,
    # so it must be re-centered before scaling
    if sample_width == 1:
        audio_data = (audio_data - 128.0) / 128.0
    else:
        audio_data = audio_data / np.iinfo(dtype).max

    # If multi-channel, convert to mono by averaging the channels
    if channels > 1:
        audio_data = audio_data.reshape(-1, channels).mean(axis=1)

    # Resample to 16 kHz if necessary
    if sample_rate != 16000:
        # Simple linear-interpolation resampling (no anti-aliasing filter)
        original_length = len(audio_data)
        desired_length = int(original_length * 16000 / sample_rate)
        indices = np.linspace(0, original_length - 1, desired_length)
        audio_data = np.interp(indices, np.arange(original_length), audio_data)
    return audio_data
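
# Alternative loader (a sketch, not used by the app): soundfile + scipy give
# format-robust reading and properly anti-aliased resampling. Both packages
# are optional extra dependencies here, hence the imports inside the function.
def read_audio_soundfile(path, target_sr=16000):
    """Read any libsndfile-supported audio file as mono float32 at target_sr."""
    import soundfile as sf
    from scipy.signal import resample_poly

    data, sr = sf.read(path, dtype="float32")
    if data.ndim > 1:
        # Down-mix multi-channel audio to mono
        data = data.mean(axis=1)
    if sr != target_sr:
        # Polyphase resampling applies an anti-aliasing filter, unlike np.interp
        data = resample_poly(data, target_sr, sr)
    return data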
def main():
    st.title("🎙️ Speech to Text Converter")
    st.markdown("### Upload a WAV file and convert speech to text")

    # Load pipeline
    with st.spinner("Loading model... This might take a few minutes the first time."):
        try:
            pipe = load_pipeline()
            st.success("Model loaded successfully! Ready to transcribe.")
        except Exception as e:
            st.error(f"Error loading model: {str(e)}")
            return

    # File upload
    audio_file = st.file_uploader(
        "Upload your audio file",
        type=['wav'],
        help="Only WAV files are supported. For better performance, keep files under 5 minutes."
    )
    if audio_file is not None:
        # Create a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            tmp_file.write(audio_file.getvalue())
            temp_path = tmp_file.name
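        # delete=False is required here: read_wav_file() reopens the file by
        # path after this with-block closes it, so deletion is deferred to the
        # finally block at the end of this branch.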
        try:
            # Display audio player
            st.audio(audio_file)

            # Add transcribe button
            if st.button("🎯 Transcribe Audio", type="primary"):
                progress_bar = st.progress(0)
                status_text = st.empty()

                try:
                    # Read audio file
                    status_text.text("Reading audio file...")
                    progress_bar.progress(25)
                    audio_data = read_wav_file(temp_path)

                    # Transcribe
                    status_text.text("Transcribing... This might take a while.")
                    progress_bar.progress(50)

                    # Use pipeline for transcription
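                    # The ASR pipeline accepts raw samples plus their sampling
                    # rate as a dict; read_wav_file() already resampled to the
                    # 16 kHz that Whisper-family models expect, and long inputs
                    # are processed in 30-second chunks per chunk_length_s.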
                    result = pipe(
                        {"raw": audio_data, "sampling_rate": 16000},
                        return_timestamps=True
                    )

                    # Update progress
                    progress_bar.progress(100)
                    status_text.text("Transcription completed!")

                    # Display results
                    st.markdown("### Transcription Result:")
                    st.write(result["text"])

                    # Display timestamps if available
                    if "chunks" in result:
                        st.markdown("### Timestamps:")
                        for chunk in result["chunks"]:
                            st.write(f"{chunk['timestamp']}: {chunk['text']}")

                    # Download button
                    st.download_button(
                        label="📥 Download Transcription",
                        data=result["text"],
                        file_name="transcription.txt",
                        mime="text/plain"
                    )
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
        finally:
            # Clean up temporary file
            if os.path.exists(temp_path):
                os.remove(temp_path)
    # Usage instructions
    with st.expander("ℹ️ Usage Instructions"):
        st.markdown("""
        ### Instructions:
        1. Upload a WAV file (16-bit PCM format recommended)
        2. Click 'Transcribe Audio'
        3. Wait for processing to complete
        4. View or download the transcription

        ### Notes:
        - Only WAV files are supported
        - Keep files under 5 minutes for best results
        - Audio should be clear with minimal background noise
        - The transcription includes timestamps for better reference
        """)
    # Footer
    st.markdown("---")
    st.markdown(
        "Made with ❤️ using the Distil-Whisper model"
    )


if __name__ == "__main__":
    main()
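
# To run the app locally (assuming this file is saved as app.py and that
# streamlit, torch, transformers, and numpy are installed):
#
#   streamlit run app.py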