speech2text-fa/main.py
2023-12-10 20:54:33 +03:30

40 lines
1.1 KiB
Python

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Alternative model identifier, kept for reference (presumably the Hugging Face
# Hub ID of the same checkpoint — confirm before switching):
# MODEL = "m3hrdadfi/wav2vec2-large-xlsr-persian"
# Location handed to `from_pretrained` for both the processor and the model.
# NOTE(review): this is a machine-specific absolute path.
MODEL = "/home/reza/data/huggingface-models/04.wav2vec2-large-xlsr-persian"
def mp3_to_text(mp3_file_path):
    """Transcribe an audio file to text with a Wav2Vec2 CTC model.

    The audio is loaded and resampled to 16 kHz with librosa, run through the
    processor and model loaded from ``MODEL``, and decoded greedily by taking
    the argmax over the model's logits. Progress messages are printed to
    stdout after each stage.

    Args:
        mp3_file_path: Path to the audio file; passed straight to
            ``librosa.load``.

    Returns:
        The decoded transcription (first and only item of the batch).
    """
    # Load the audio file and resample to 16 kHz.
    target_rate = 16000
    audio, sample_rate = librosa.load(mp3_file_path, sr=target_rate)
    print()
    print("Resampling is Done!")

    # Load the processor (feature extractor + tokenizer) and the model.
    processor = Wav2Vec2Processor.from_pretrained(MODEL)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL)
    print()
    print("Loading model is Done!")

    # Preprocess the audio; reuse the rate librosa reported so the two values
    # cannot drift apart.
    input_values = processor(
        audio,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding="longest",
    ).input_values

    # Inference only: disable gradient tracking so no autograd graph is built.
    with torch.no_grad():
        logits = model(input_values).logits
    print()
    print("Processing the audio is Done!")

    # Decode the predicted IDs (greedy: most likely token per frame).
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print()
    print("Decoding the prodicted IDs is Done!")
    return transcription[0]
if __name__ == "__main__":
    # Run the demo transcription only when executed as a script, so that
    # importing this module does not load the model or read the sample file.
    # Alternative sample input:
    # text = mp3_to_text("samples/captcha.mp3")
    text = mp3_to_text("samples/sample1.wav")
    print()
    print(text)