python - Issues Using Essentia Models For Music Tagging

BACKGROUND:

I was using some models to generate tags for music, such as the genre, mood, and instruments present in an audio file. The original models are in the .pb format. The models are available on the Essentia models page (https://essentia.upf.edu/models.html), and the models I am using are:

discogs-effnet-bs64-1
genre_discogs400-discogs-effnet-1
mtg_jamendo_instrument-discogs-effnet-1
mtg_jamendo_moodtheme-discogs-effnet-1

The inputs and outputs of the models are given in the respective JSON files, which list the classes as well as the input/output sizes and names.

The default .pb models simply use the inbuilt functions:

from essentia.standard import (
    MonoLoader,
    TensorflowPredictEffnetDiscogs,
    TensorflowPredict2D,
)
def essentia_feature_extraction(audio_file, sample_rate):
    """Tag an audio file with genres, moods/themes and instruments.

    The file is decoded with Essentia's ``MonoLoader``, passed through the
    module-level ``embedding_model``, and the resulting embeddings are fed to
    the genre, mood/theme and instrument classifiers in turn.

    Args:
        audio_file: Path of the audio file to analyse.
        sample_rate: Accepted for interface compatibility but not used;
            the audio is always loaded at 16000 Hz.

    Returns:
        A dict with the keys ``"genres"``, ``"moods"`` and ``"instruments"``,
        each holding whatever ``filter_predictions`` returns for that head.
    """
    # Decode the file to a mono signal at the fixed 16 kHz rate.
    loader = MonoLoader(filename=audio_file, sampleRate=16000, resampleQuality=4)
    waveform = loader()

    # One embedding pass is shared by all three classifier heads.
    features = embedding_model(waveform)

    tags = {}
    genre_names = [process_labels(label) for label in genre_labels]

    # Genre head: scored against the post-processed genre label list.
    genre_scores = genre_model(features)
    tags["genres"] = filter_predictions(genre_scores, genre_names)

    # Mood/theme head: the only head with an explicit threshold (0.05).
    mood_scores = mood_model(features)
    tags["moods"] = filter_predictions(
        mood_scores, mood_theme_classes, threshold=0.05
    )

    # Instrument head: relies on filter_predictions' default threshold.
    instrument_scores = instrument_model(features)
    tags["instruments"] = filter_predictions(instrument_scores, instrument_classes)

    return tags

THE PROBLEM:

No matter what audio file I use as input, I consistently get the same output predictions for mood and instruments. The genre predictions are now usually all zero (meaning "unknown genre").

import librosa
import numpy as np
import tritonclient.http as httpclient

def essentia_feature_extraction_triton(audio_file, sample_rate):
    """Run genre, mood and instrument inference through a Triton server.

    The audio is loaded with librosa, turned into a single fixed-size
    mel-spectrogram patch, and sent to the EffNet-Discogs model served by
    Triton. The embeddings returned by that model are then sent to the mood
    and instrument models.

    Args:
        audio_file: Path of the audio file to analyse.
        sample_rate: Not used in the visible code; audio is always loaded
            at 16000 Hz.

    NOTE(review): this snippet is truncated. The ``except``/``finally``
    clause matching the ``try`` below and any return statement are not
    visible, so the function is not runnable as shown.
    """
    try:
        # Load as mono at 16 kHz and force float32 samples.
        audio, sr = librosa.load(audio_file, sr=16000, mono=True)
        audio = audio.astype(np.float32)

        # 128-band mel power spectrogram (n_fft=2048, hop=512), then dB scale.
        # NOTE(review): these features come from librosa rather than from
        # Essentia's own predictor used in the .pb version; the band count,
        # frame/hop sizes and log compression presumably need to match what
        # the model was trained on -- confirm against the model's JSON
        # metadata.
        mel_spectrogram = librosa.feature.melspectrogram(
            y=audio, sr=16000, n_fft=2048, hop_length=512, n_mels=128
        )
        mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=1.0)

        # Force the time axis (axis 1) to exactly 96 frames: zero-pad short
        # clips, truncate long ones. Everything after the first 96 frames of
        # the file is discarded, so only the start of the track is analysed.
        if mel_spectrogram.shape[1] < 96:
            mel_spectrogram = np.pad(
                mel_spectrogram, ((0, 0), (0, 96 - mel_spectrogram.shape[1])), mode="constant"
            )
        elif mel_spectrogram.shape[1] > 96:
            mel_spectrogram = mel_spectrogram[:, :96]

        # Add a leading axis, giving a (1, 128, 96) float32 array.
        # NOTE(review): verify this axis order and patch size against the
        # "melspectrogram" input shape declared in the model's JSON file.
        mel_spectrogram = np.expand_dims(mel_spectrogram, axis=0).astype(np.float32)


        with httpclient.InferenceServerClient(url=TRITON_URL) as triton_client:
            # --- EFFNET DISCOGS (Combined Model) ---
            # One request returns both the genre activations and the
            # embeddings that feed the downstream models.
            input_name = "melspectrogram"
            genre_output_name = "activations"
            embedding_output_name = "embeddings"

            inputs = [httpclient.InferInput(input_name, mel_spectrogram.shape, "FP32")]
            inputs[0].set_data_from_numpy(mel_spectrogram)

            outputs = [
                httpclient.InferRequestedOutput(genre_output_name),
                httpclient.InferRequestedOutput(embedding_output_name)
            ]

            results = triton_client.infer(
                model_name=EFFNET_DISCOGS_MODEL_NAME, inputs=inputs, outputs=outputs
            )

            genre_predictions = results.as_numpy(genre_output_name)
            embeddings = results.as_numpy(embedding_output_name)
            embeddings = embeddings.astype(np.float32)

            # --- MOOD PREDICTION ---
            # The embeddings are forwarded unchanged as the model input.
            input_name = "embeddings"
            output_name = "activations"
            inputs = [httpclient.InferInput(input_name, embeddings.shape, "FP32")]
            inputs[0].set_data_from_numpy(embeddings)

            outputs = [httpclient.InferRequestedOutput(output_name)]
            mood_predictions = triton_client.infer(
                model_name=MOOD_MODEL_NAME, inputs=inputs, outputs=outputs
            ).as_numpy(output_name)

            # --- INSTRUMENT PREDICTION ---
            # Same request shape as the mood model, different model name.
            input_name = "embeddings"
            output_name = "activations"
            inputs = [httpclient.InferInput(input_name, embeddings.shape, "FP32")]
            inputs[0].set_data_from_numpy(embeddings)

            outputs = [httpclient.InferRequestedOutput(output_name)]
            instrument_predictions = triton_client.infer(
                model_name=INSTRUMENT_MODEL_NAME, inputs=inputs, outputs=outputs
            ).as_numpy(output_name)

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

python - Issues Using Essentia Models For Music Tagging - Stack Overflow

与本文相关的文章

评论列表(0)