import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Location handed to `from_pretrained` for both the processor and the model.
# The Hub ID below is the alternative the author left in place; presumably the
# local directory is a downloaded copy of it -- TODO confirm.
# MODEL = "m3hrdadfi/wav2vec2-large-xlsr-persian"
MODEL = "/home/reza/data/huggingface-models/04.wav2vec2-large-xlsr-persian"

# Single source of truth for the sampling rate: the audio is resampled to this
# rate on load and the same value is reported to the processor, so the two can
# never drift apart.
SAMPLE_RATE = 16000


def mp3_to_text(mp3_file_path: str) -> str:
    """Transcribe one audio file with the Wav2Vec2 CTC model at ``MODEL``.

    The processor and the model are loaded from ``MODEL`` on every call, and
    a progress message is printed to stdout after each stage.

    Args:
        mp3_file_path: Path of the audio file to transcribe. Despite the
            parameter name, the path is passed straight to ``librosa.load``;
            this script itself calls the function with a ``.wav`` file.

    Returns:
        The first element of the decoded batch, i.e. the transcription of the
        single input file.
    """
    # Load the audio and resample it to SAMPLE_RATE. The rate returned by
    # librosa is the one we requested, so it is not kept.
    audio, _ = librosa.load(mp3_file_path, sr=SAMPLE_RATE)
    print()
    print("Resampling is Done!")

    # Load processor and model from Hugging Face (local path or Hub ID).
    processor = Wav2Vec2Processor.from_pretrained(MODEL)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL)
    print()
    print("Loading model is Done!")

    # Preprocess the audio into a batch of one PyTorch tensor.
    input_values = processor(
        audio,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        padding="longest",
    ).input_values

    # Inference only: disable gradient tracking so no autograd state is
    # recorded during the forward pass. The logits themselves are unchanged.
    with torch.no_grad():
        logits = model(input_values).logits
    print()
    print("Processing the audio is Done!")

    # Greedy CTC decoding: take the highest-scoring token ID at each frame and
    # let the processor turn the ID sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print()
    print("Decoding the prodicted IDs is Done!")

    return transcription[0]


# Script entry: transcribe the bundled sample and print the result.
# text = mp3_to_text("samples/captcha.mp3")
text = mp3_to_text("samples/sample1.wav")
print()
print(text)