seniruk/sinscribe-sinhala-stt
Updated • 20 • 1
How to use seniruk/whisper-small-si-cpu with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("automatic-speech-recognition", model="seniruk/whisper-small-si-cpu")

# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
processor = AutoProcessor.from_pretrained("seniruk/whisper-small-si-cpu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("seniruk/whisper-small-si-cpu")

I’m an AI undergraduate and an AI enthusiast, working on machine learning projects and open-source contributions.
I enjoy exploring AI pipelines, natural language processing, and building tools that make development easier.
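This model is a fine-tuned version of openai/whisper-small on the Sinhala CSV + FLACs dataset. It achieves the following results on the evaluation set (values from the last logged checkpoint, step 18000, in the training table below): validation loss 0.0829, WER 29.1387.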
This model can be used for Sinhala speech-to-text conversion. For the best results, provide audio with as little background noise as possible.
Trained on the custom dataset - seniruk/sinscribe-sinhala-stt
Trained on the final dataset above for 2 epochs; training took 41:00:59 (hh:mm:ss) on a device with the spec below.
| Training Loss | Epoch | Step | Validation Loss | Wer |
|---|---|---|---|---|
| 0.1871 | 0.1102 | 1000 | 0.1834 | 51.9170 |
| 0.1429 | 0.2204 | 2000 | 0.1517 | 44.7541 |
| 0.1345 | 0.3307 | 3000 | 0.1336 | 41.0627 |
| 0.1183 | 0.4409 | 4000 | 0.1237 | 38.6625 |
| 0.114 | 0.5511 | 5000 | 0.1151 | 36.9654 |
| 0.1056 | 0.6613 | 6000 | 0.1080 | 35.2670 |
| 0.0968 | 0.7715 | 7000 | 0.1037 | 34.4457 |
| 0.1011 | 0.8817 | 8000 | 0.0986 | 33.2741 |
| 0.0971 | 0.9920 | 9000 | 0.0961 | 32.7147 |
| 0.0713 | 1.1022 | 10000 | 0.0947 | 32.0250 |
| 0.0706 | 1.2124 | 11000 | 0.0940 | 32.0766 |
| 0.0691 | 1.3226 | 12000 | 0.0907 | 31.2485 |
| 0.0684 | 1.4328 | 13000 | 0.0893 | 30.9512 |
| 0.0718 | 1.5430 | 14000 | 0.0875 | 30.3592 |
| 0.0642 | 1.6533 | 15000 | 0.0859 | 30.0388 |
| 0.0667 | 1.7635 | 16000 | 0.0842 | 29.5840 |
| 0.0667 | 1.8737 | 17000 | 0.0835 | 29.3193 |
| 0.0677 | 1.9839 | 18000 | 0.0829 | 29.1387 |
import torch
import soundfile
import torchaudio
from transformers import WhisperForConditionalGeneration, WhisperProcessor
# Device used for both the model weights and the input features.
device = "cpu"

# torchaudio.set_audio_backend() is deprecated in the torchaudio 2.x series and
# is absent from newer releases, where the I/O backend is selected per call.
# Only call it when the installed torchaudio still provides it, so the script
# does not fail with AttributeError on current versions.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")

# Fine-tuned Whisper checkpoint and its matching processor
# (feature extractor + tokenizer), both loaded from the Hugging Face Hub.
model = WhisperForConditionalGeneration.from_pretrained("seniruk/whisper-small-si-cpu").to(device)
processor = WhisperProcessor.from_pretrained("seniruk/whisper-small-si-cpu")
def transcribe(audio_path):
    """Transcribe an audio file to text with the fine-tuned Whisper model.

    The audio is loaded with torchaudio, averaged down to a single channel,
    resampled to 16 kHz when its sample rate differs, and then passed through
    the module-level ``processor`` and ``model``.

    Args:
        audio_path: Path of the audio file to transcribe, or ``None`` when
            no audio was provided.

    Returns:
        The decoded transcription string. When no audio was supplied, or
        when any step fails, a human-readable message is returned instead;
        errors are reported in the return value rather than raised.
    """
    if audio_path is None:
        return "No audio received. Please record something."

    try:
        waveform, sample_rate = torchaudio.load(audio_path)

        # A file with zero samples carries nothing to transcribe.
        if waveform.numel() == 0:
            return "No audio received. Please record something."

        # Average the channels so multi-channel recordings become mono.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
            sample_rate = 16000

        # Drop only the channel axis; a bare squeeze() would also collapse
        # a one-sample clip into a 0-d array.
        samples = waveform.squeeze(0).numpy()

        # Keep the features on the same device as the model instead of a
        # hard-coded "cpu" string.
        inputs = processor(samples, sampling_rate=sample_rate, return_tensors="pt").input_features.to(device)
        with torch.no_grad():
            predicted_ids = model.generate(inputs)
        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Boundary handler: callers expect a string, never an exception.
        return f"Error during transcription: {e}"
# Example run: transcribe a local file and print the resulting text.
print(transcribe(audio_path='audio.wav'))
import torch
import soundfile
import torchaudio
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import gradio as gr
# Device used for both the model weights and the input features.
device = "cpu"

# torchaudio.set_audio_backend() is deprecated in the torchaudio 2.x series and
# is absent from newer releases, where the I/O backend is selected per call.
# Only call it when the installed torchaudio still provides it, so the app
# does not fail with AttributeError on current versions.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")

# Fine-tuned Whisper checkpoint and its matching processor
# (feature extractor + tokenizer), both loaded from the Hugging Face Hub.
model = WhisperForConditionalGeneration.from_pretrained("seniruk/whisper-small-si-cpu").to(device)
processor = WhisperProcessor.from_pretrained("seniruk/whisper-small-si-cpu")

MAX_DURATION_SECONDS = 30  # Limit: 30 seconds of audio
def transcribe(audio_path):
    """Transcribe a recorded or uploaded audio file to text.

    The audio is loaded with torchaudio, averaged down to a single channel,
    rejected if it is longer than ``MAX_DURATION_SECONDS``, resampled to
    16 kHz when its sample rate differs, and then passed through the
    module-level ``processor`` and ``model``.

    Args:
        audio_path: Path of the audio file to transcribe, or ``None`` when
            no audio was provided.

    Returns:
        The decoded transcription string. When no audio was supplied, the
        clip is too long, or any step fails, a human-readable message is
        returned instead; errors are reported in the return value rather
        than raised.
    """
    if audio_path is None:
        return "No audio received. Please record or upload a file."

    try:
        # Load audio
        waveform, sample_rate = torchaudio.load(audio_path)

        # A file with zero samples carries nothing to transcribe.
        if waveform.numel() == 0:
            return "No audio received. Please record or upload a file."

        # Convert to mono by averaging the channels
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Duration check, measured at the file's original sample rate
        duration = waveform.shape[1] / sample_rate
        if duration > MAX_DURATION_SECONDS:
            return f"Audio too long ({duration:.1f}s). Please use an audio clip shorter than {MAX_DURATION_SECONDS}s."

        # Resample if necessary
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
            sample_rate = 16000

        # Drop only the channel axis; a bare squeeze() would also collapse
        # a one-sample clip into a 0-d array.
        samples = waveform.squeeze(0).numpy()

        # Process through Whisper
        inputs = processor(
            samples,
            sampling_rate=sample_rate,
            return_tensors="pt",
        ).input_features.to(device)

        with torch.no_grad():
            predicted_ids = model.generate(inputs)

        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Boundary handler: the UI expects a string, never an exception.
        return f"Error during transcription: {e}"
# Gradio UI: a single audio input (microphone recording or file upload, handed
# to transcribe() as a file path) and a single text box for the result.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or Upload Audio"),
    outputs=gr.Textbox(label="Transcription"),
    title="Whisper Small Sinhala (CPU)",
    description=(
        # The leading emoji was mis-encoded in the original text; it is
        # restored here as the studio-microphone emoji and written as an
        # escape so it survives re-encoding of this file.
        "\U0001F399\uFE0F Sinhala speech-to-text using the fine-tuned Whisper Small model (Sinscribe).\n"
        # Derive the advertised limit from the constant that transcribe()
        # enforces so the two cannot drift apart.
        f"You can record or upload audio up to {MAX_DURATION_SECONDS} seconds long."
    ),
)
iface.launch()
Base model
openai/whisper-small