From e59608928de74527b2ef0c3e1ca8d984f43eb3d0 Mon Sep 17 00:00:00 2001 From: Mahmoud Alismail Date: Tue, 8 Nov 2022 21:43:26 -0500 Subject: [PATCH] reorganizing the repo and updating the README --- CLAP_API/configs/__init__.py | 0 README.md | 125 +++--------------- requirements.txt | 3 +- {CLAP_API => src}/CLAPWrapper.py | 2 +- {CLAP_API => src}/__init__.py | 0 {CLAP_API => src}/configs/config.yml | 0 .../examples/esc50_dataset.py | 0 src/examples/zero_shot_classification.py | 46 +++++++ .../examples/zero_shot_predictions.py | 45 +++++-- {CLAP_API => src}/models/__init__.py | 0 {CLAP_API => src}/models/audio.py | 0 {CLAP_API => src}/models/clap.py | 0 {CLAP_API => src}/models/utils.py | 0 13 files changed, 103 insertions(+), 118 deletions(-) delete mode 100644 CLAP_API/configs/__init__.py rename {CLAP_API => src}/CLAPWrapper.py (99%) rename {CLAP_API => src}/__init__.py (100%) rename {CLAP_API => src}/configs/config.yml (100%) rename esc50_dataset.py => src/examples/esc50_dataset.py (100%) create mode 100644 src/examples/zero_shot_classification.py rename file_inference.py => src/examples/zero_shot_predictions.py (53%) rename {CLAP_API => src}/models/__init__.py (100%) rename {CLAP_API => src}/models/audio.py (100%) rename {CLAP_API => src}/models/clap.py (100%) rename {CLAP_API => src}/models/utils.py (100%) diff --git a/CLAP_API/configs/__init__.py b/CLAP_API/configs/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/README.md b/README.md index dbccf55..e1c84dd 100644 --- a/README.md +++ b/README.md @@ -4,26 +4,18 @@ CLAP (Contrastive Language-Audio Pretraining) is a neural network model that lea clap_diagram_v3 -## Citation -https://arxiv.org/pdf/2206.04769.pdf -``` -@article{elizalde2022clap, - title={Clap: Learning audio concepts from natural language supervision}, - author={Elizalde, Benjamin and Deshmukh, Soham and Ismail, Mahmoud Al and Wang, Huaming}, - journal={arXiv preprint arXiv:2206.04769}, - year={2022} -} -``` - ## Setup -- The 
setup assumes [Anaconda](https://www.anaconda.com) is installed -- Open the anaconda terminal and follow the below commands. The symbol `{..}` indicates user input. + +To get started, install the dependencies with Python 3: `pip install -r requirements.txt`. + +If you have [conda](https://www.anaconda.com) installed, you can run the following: + ```shell -> git clone https://github.com/microsoft/CLAP.git -> cd CLAP -> conda create -n clap python=3.8 -> conda activate clap -> pip install -r requirements.txt +git clone https://github.com/microsoft/CLAP.git && \ +cd CLAP && \ +conda create -n clap python=3.8 && \ +conda activate clap && \ +pip install -r requirements.txt ``` ## CLAP weights @@ -31,6 +23,9 @@ Request CLAP weights by filling this form: [link](https://forms.office.com/r/ULb ## Usage + +Please take a look at `src/examples` for usage examples. + - Load model ```python from CLAP_API import CLAP @@ -53,93 +48,15 @@ audio_embeddings = clap_model.get_audio_embeddings(file_paths: List[str]) sim = clap_model.compute_similarity(audio_embeddings, text_embeddings) ``` -### Zero-Shot inference on an audio file from [ESC50 dataset](https://github.com/karolpiczak/ESC-50) - -```python -from CLAP_API import CLAP -from esc50_dataset import ESC50 -import time -import torch.nn.functional as F - -# Load CLAP -weights_path = 'best.pth' # Add weight path here -clap_model = CLAP(weights_path, use_cuda=False) - -# Load dataset -dataset = ESC50(root='data', download=True) # set download=True when dataset is not downloaded -audio_file, target, one_hot_target = dataset[1000] -audio_file = [audio_file] -prompt = 'this is a sound of ' -y = [prompt + x for x in dataset.classes] - -print('Computing text embeddings') -text_embeddings = clap_model.get_text_embeddings(y) -print('Computing audio embeddings') -audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True) -similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
- -similarity = F.softmax(similarity, dim=1) -values, indices = similarity[0].topk(5) -# Print the result -print("Ground Truth: {}".format(target)) -print("Top predictions:\n") -for value, index in zip(values, indices): - print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%") +## Citation +https://arxiv.org/pdf/2206.04769.pdf ``` - -The output (the exact numbers may vary): - -``` -Ground Truth: coughing -Top predictions: - - coughing: 86.34% - sneezing: 9.30% -drinking sipping: 1.31% - laughing: 1.20% - glass breaking: 0.81% -``` - -### Zero-Shot Classification of [ESC50 dataset](https://github.com/karolpiczak/ESC-50) - -```python -from CLAP_API import CLAP -from esc50_dataset import ESC50 -import torch.nn.functional as F -import numpy as np -from tqdm import tqdm -from sklearn.metrics import accuracy_score - -# Load CLAP -weights_path = # Add weight path here -clap_model = CLAP(weights_path, use_cuda=False) - -# Load dataset -dataset = ESC50(root='data', download=False) -prompt = 'this is a sound of ' -Y = [prompt + x for x in dataset.classes] - -# Computing text embeddings -text_embeddings = clap_model.get_text_embeddings(Y) - -# Computing audio embeddings -y_preds, y_labels = [], [] -for i in tqdm(range(len(dataset))): - x, _, one_hot_target = dataset.__getitem__(i) - audio_embeddings = clap_model.get_audio_embeddings([x], resample=True) - similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings) - y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy() - y_preds.append(y_pred) - y_labels.append(one_hot_target.detach().cpu().numpy()) - -y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0) -acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1)) -print('ESC50 Accuracy {}'.format(acc)) -``` -The output: - -``` -ESC50 Accuracy: 82.6% +@article{elizalde2022clap, + title={Clap: Learning audio concepts from natural language supervision}, + author={Elizalde, Benjamin and 
Deshmukh, Soham and Ismail, Mahmoud Al and Wang, Huaming}, + journal={arXiv preprint arXiv:2206.04769}, + year={2022} +} ``` ## Contributing diff --git a/requirements.txt b/requirements.txt index 033d3ff..00d0379 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,5 +54,4 @@ torchvision==0.9.1+cu111 tqdm==4.60.0 transformers==4.5.1 typing-extensions==3.10.0.0 -urllib3==1.26.4 -wandb==0.10.28 \ No newline at end of file +urllib3==1.26.4 \ No newline at end of file diff --git a/CLAP_API/CLAPWrapper.py b/src/CLAPWrapper.py similarity index 99% rename from CLAP_API/CLAPWrapper.py rename to src/CLAPWrapper.py index dc76476..6900c3d 100644 --- a/CLAP_API/CLAPWrapper.py +++ b/src/CLAPWrapper.py @@ -12,7 +12,7 @@ import math import torchaudio.transforms as T import os import torch -from importlib_resources import files, as_file +from importlib_resources import files class CLAPWrapper(): diff --git a/CLAP_API/__init__.py b/src/__init__.py similarity index 100% rename from CLAP_API/__init__.py rename to src/__init__.py diff --git a/CLAP_API/configs/config.yml b/src/configs/config.yml similarity index 100% rename from CLAP_API/configs/config.yml rename to src/configs/config.yml diff --git a/esc50_dataset.py b/src/examples/esc50_dataset.py similarity index 100% rename from esc50_dataset.py rename to src/examples/esc50_dataset.py diff --git a/src/examples/zero_shot_classification.py b/src/examples/zero_shot_classification.py new file mode 100644 index 0000000..130a1ef --- /dev/null +++ b/src/examples/zero_shot_classification.py @@ -0,0 +1,46 @@ +""" +This is an example using CLAP to perform zeroshot + classification on ESC50 (https://github.com/karolpiczak/ESC-50). 
+"""
+
+from src.CLAPWrapper import CLAP
+from esc50_dataset import ESC50
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
+from sklearn.metrics import accuracy_score
+
+# Load dataset
+dataset = ESC50(root='data', download=False)
+prompt = 'this is a sound of '
+y = [prompt + x for x in dataset.classes]
+
+
+# Load and initialize CLAP
+weights_path = ''  # Add weight path here
+clap_model = CLAP(weights_path, use_cuda=False)
+
+
+# Computing text embeddings
+text_embeddings = clap_model.get_text_embeddings(y)
+
+# Computing audio embeddings
+y_preds, y_labels = [], []
+for i in tqdm(range(len(dataset))):
+    x, _, one_hot_target = dataset[i]
+    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
+    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
+    y_preds.append(y_pred)
+    y_labels.append(one_hot_target.detach().cpu().numpy())
+
+y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
+acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
+print(f'ESC50 Accuracy: {acc:.1%}')
+
+"""
+The output:
+
+ESC50 Accuracy: 82.6%
+
+"""
diff --git a/file_inference.py b/src/examples/zero_shot_predictions.py
similarity index 53%
rename from file_inference.py
rename to src/examples/zero_shot_predictions.py
index d1e5546..4333f51 100644
--- a/file_inference.py
+++ b/src/examples/zero_shot_predictions.py
@@ -1,29 +1,52 @@
-from CLAP_API import CLAP
+"""
+This is an example using CLAP for zero-shot
+ inference using ESC50 (https://github.com/karolpiczak/ESC-50).
+"""
+
+from src.CLAPWrapper import CLAP
 from esc50_dataset import ESC50
-import time
 import torch.nn.functional as F
 
-# Load CLAP
-weights_path = 'C:\\Users\\sdeshmukh\\Desktop\\CLAP_package\\model\\new\\best.pth' # Add weight path here
-clap_model = CLAP(weights_path, use_cuda=False)
-
-# Load dataset
+# Load ESC50 dataset
 dataset = ESC50(root='data', download=True) # set download=True when dataset is not downloaded
 audio_file, target, one_hot_target = dataset[1000]
 audio_file = [audio_file]
 prompt = 'this is a sound of '
 y = [prompt + x for x in dataset.classes]
 
-print('Computing text embeddings')
+# Load and initialize CLAP
+weights_path = ''  # Add weight path here
+
+# Setting use_cuda = True will load the model on a GPU using CUDA
+clap_model = CLAP(weights_path, use_cuda=False)
+
+# compute text embeddings from natural text
 text_embeddings = clap_model.get_text_embeddings(y)
-print('Computing audio embeddings')
+
+# compute the audio embeddings from an audio file
 audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True)
+
+# compute the similarity between audio_embeddings and text_embeddings
 similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
 
 similarity = F.softmax(similarity, dim=1)
 values, indices = similarity[0].topk(5)
-# Print the result
+
+# view the results
 print("Ground Truth: {}".format(target))
 print("Top predictions:\n")
 for value, index in zip(values, indices):
-    print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
\ No newline at end of file
+    print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
+
+"""
+The output (the exact numbers may vary):
+
+Ground Truth: coughing
+Top predictions:
+
+        coughing: 86.34%
+        sneezing: 9.30%
+drinking sipping: 1.31%
+        laughing: 1.20%
+  glass breaking: 0.81%
+"""
\ No newline at end of file
diff --git a/CLAP_API/models/__init__.py b/src/models/__init__.py
similarity index 100%
rename from CLAP_API/models/__init__.py
rename to src/models/__init__.py
diff --git 
a/CLAP_API/models/audio.py b/src/models/audio.py similarity index 100% rename from CLAP_API/models/audio.py rename to src/models/audio.py diff --git a/CLAP_API/models/clap.py b/src/models/clap.py similarity index 100% rename from CLAP_API/models/clap.py rename to src/models/clap.py diff --git a/CLAP_API/models/utils.py b/src/models/utils.py similarity index 100% rename from CLAP_API/models/utils.py rename to src/models/utils.py