The GPU backend has about a 5-second model-load overhead that the CPU backend doesn't.
Also, Python isn't the fastest language, which really hurts when iterating on DSP-like data.
I hacked the code so the model stays in memory and the inference runs twice; that removes quite a bit — namely the roughly constant 5-second load delay — from the second run.
The code and model are actually pretty rough, but this was purely a test of ArmNN. I should have done the same with the MFCC stage: load once, run twice.
# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
"""Automatic speech recognition with PyArmNN demo for processing audio clips to text."""
import sys
import os
import numpy as np
import psutil
script_dir = os.path.dirname(__file__)
sys.path.insert(1, os.path.join(script_dir, '..', 'common'))
from argparse import ArgumentParser
from network_executor import ArmnnNetworkExecutor
from utils import prepare_input_data
from audio_capture import AudioCaptureParams, capture_audio
from audio_utils import decode_text, display_text
from wav2letter_mfcc import Wav2LetterMFCC, W2LAudioPreprocessor
from mfcc import MFCCParams
from datetime import datetime
# Model-specific output alphabet: indices 0-25 map to 'a'..'z', followed by
# apostrophe, space, and the trailing '$' symbol.
labels = dict(enumerate("abcdefghijklmnopqrstuvwxyz' $"))
def parse_args():
    """Parse and return the command-line arguments for the ASR demo."""
    parser = ArgumentParser(description="ASR with PyArmNN")
    # The two mandatory string arguments share the same registration shape.
    for flag, help_text in (
        ("--audio_file_path", "Path to the audio file to perform ASR"),
        ("--model_file_path", "Path to ASR model to use"),
    ):
        parser.add_argument(flag, required=True, type=str, help=help_text)
    parser.add_argument(
        "--preferred_backends",
        type=str,
        nargs="+",
        default=["GpuAcc", "CpuAcc", "CpuRef"],
        help="""List of backends in order of preference for optimizing
subgraphs, falling back to the next backend in the list on unsupported
layers. Defaults to [GpuAcc, CpuAcc, CpuRef]""",
    )
    return parser.parse_args()
def main(args, network):
    """Run ASR over the audio clip named in ``args.audio_file_path``.

    ``network`` is an already-constructed ArmnnNetworkExecutor, so the
    one-off model/backend load cost is paid by the caller, not here —
    calling this twice measures warm-inference time on the second run.
    Relies on the module-global ``starttime`` set by the ``__main__`` block.
    """
    # Read command line args
    audio_file = args.audio_file_path
    # NOTE(review): two identical timing probes back to back — presumably to
    # gauge the cost of the print itself; confirm or drop one.
    print(datetime.now() - starttime, psutil.cpu_percent())
    print(datetime.now() - starttime, psutil.cpu_percent())
    # Specify model specific audio data requirements
    audio_capture_params = AudioCaptureParams(dtype=np.float32, overlap=31712, min_samples=47712, sampling_freq=16000,
                                              mono=True)
    # Splits the clip into overlapping windows matching the model's input.
    buffer = capture_audio(audio_file, audio_capture_params)
    print(datetime.now() - starttime, psutil.cpu_percent())
    # Extract features and create the preprocessor
    mfcc_params = MFCCParams(sampling_freq=16000, num_fbank_bins=128, mel_lo_freq=0, mel_hi_freq=8000,
                             num_mfcc_feats=13, frame_len=512, use_htk_method=False, n_fft=512)
    print(datetime.now() - starttime, psutil.cpu_percent())
    wmfcc = Wav2LetterMFCC(mfcc_params)
    preprocessor = W2LAudioPreprocessor(wmfcc, model_input_size=296, stride=160)
    # decode_text stitches window outputs together via this right context.
    current_r_context = ""
    is_first_window = True
    print("Processing Audio Frames...")
    for audio_data in buffer:
        # Prepare the input Tensors
        input_data = prepare_input_data(audio_data, network.get_data_type(), network.get_input_quantization_scale(0),
                                        network.get_input_quantization_offset(0), preprocessor)
        # Run inference
        output_result = network.run([input_data])
        # Slice and Decode the text, and store the right context
        current_r_context, text = decode_text(is_first_window, labels, output_result)
        is_first_window = False
        display_text(text)
    print(datetime.now() - starttime, psutil.cpu_percent())
    # Flush the leftover right context from the final window.
    print(current_r_context, flush=True)
    print(datetime.now() - starttime, psutil.cpu_percent())
    print("Inference End", psutil.cpu_percent())
if __name__ == "__main__":
    args = parse_args()
    print("Inference Start", psutil.cpu_percent())
    starttime = datetime.now()
    # Create the ArmNN inference runner ONCE and keep it resident; building
    # it here (not inside main) is the hack that avoids paying the ~5 s
    # backend/model load on the second run.
    network = ArmnnNetworkExecutor(args.model_file_path, args.preferred_backends)
    print(datetime.now() - starttime, psutil.cpu_percent())
    main(args, network)
    # Reset the clock and run the same clip again: comparing the two runs
    # isolates the constant start-up overhead from steady-state inference.
    starttime = datetime.now()
    print(datetime.now() - starttime, psutil.cpu_percent())
    main(args, network)
I was using these samples — longer clips, but not too long — from https://github.com/ggerganov/whisper.cpp, which is a CPU implementation of OpenAI's Whisper built on the author's own tensor library, which is interesting.
#
# Audio samples
#
# download a few audio samples into folder "./samples":
#
# Fetches two George W. Bush radio addresses and the Henry F. Phillips
# recording from Wikimedia, plus OpenAI's "micro-machines" Whisper demo
# clip, then converts each to 16 kHz mono signed 16-bit PCM WAV (the
# input format the speech models expect). The downloaded mm1.wav is
# removed after it has been converted to mm0.wav.
.PHONY: samples
samples:
	@echo "Downloading samples..."
	@mkdir -p samples
	@wget --quiet --show-progress -O samples/gb0.ogg https://upload.wikimedia.org/wikipedia/commons/2/22/George_W._Bush%27s_weekly_radio_address_%28November_1%2C_2008%29.oga
	@wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
	@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
	@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
	@echo "Converting to 16-bit WAV ..."
	@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
	@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
	@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
	@ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
	@rm samples/mm1.wav
I will give them another go, even though the model we have is pretty bad. https://github.com/breizhn/DTLN might be better: it is also split into two models, and it would be really interesting to run GpuAcc on one and CpuAcc on the other. From the observed load, though, GpuAcc looks more like a helper for the CPU than something that offloads most of the work from it, the way CUDA does.
You should also be able to take a single big model, partition its layers, and run it with two delegates — which is what I was wondering about.
It's a shame RkNN didn't go the delegate route.
I was lazy: the code chunks the audio through the MFCC stage, but it would be quite possible to convert the whole clip up front and feed the precomputed MFCC frames through the model in chunks — I just didn't bother.
I may do that, since the interest is purely in the delegate and GPU-vs-CPU comparison, not in some horrid MFCC implementation.
It's strange, because TensorFlow has MFCC ops that you can subclass into a model — as https://github.com/google-research/google-research/tree/master/kws_streaming does — and even librosa is more performant than the ArmNN example.
Thanks, though — it now makes much more sense why the CPU is being hit so hard.