From e59608928de74527b2ef0c3e1ca8d984f43eb3d0 Mon Sep 17 00:00:00 2001
From: Mahmoud Alismail <mahmoud.alismail@gmail.com>
Date: Tue, 8 Nov 2022 21:43:26 -0500
Subject: [PATCH] reorganizing the repo and updating the README

---
 CLAP_API/configs/__init__.py                  |   0
 README.md                                     | 125 +++---------------
 requirements.txt                              |   3 +-
 {CLAP_API => src}/CLAPWrapper.py              |   2 +-
 {CLAP_API => src}/__init__.py                 |   0
 {CLAP_API => src}/configs/config.yml          |   0
 .../examples/esc50_dataset.py                 |   0
 src/examples/zero_shot_classification.py      |  46 +++++++
 .../examples/zero_shot_predictions.py         |  45 +++++--
 {CLAP_API => src}/models/__init__.py          |   0
 {CLAP_API => src}/models/audio.py             |   0
 {CLAP_API => src}/models/clap.py              |   0
 {CLAP_API => src}/models/utils.py             |   0
 13 files changed, 103 insertions(+), 118 deletions(-)
 delete mode 100644 CLAP_API/configs/__init__.py
 rename {CLAP_API => src}/CLAPWrapper.py (99%)
 rename {CLAP_API => src}/__init__.py (100%)
 rename {CLAP_API => src}/configs/config.yml (100%)
 rename esc50_dataset.py => src/examples/esc50_dataset.py (100%)
 create mode 100644 src/examples/zero_shot_classification.py
 rename file_inference.py => src/examples/zero_shot_predictions.py (53%)
 rename {CLAP_API => src}/models/__init__.py (100%)
 rename {CLAP_API => src}/models/audio.py (100%)
 rename {CLAP_API => src}/models/clap.py (100%)
 rename {CLAP_API => src}/models/utils.py (100%)
diff --git a/CLAP_API/configs/__init__.py b/CLAP_API/configs/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/README.md b/README.md
index dbccf55..e1c84dd 100644
--- a/README.md
+++ b/README.md
@@ -4,26 +4,18 @@ CLAP (Contrastive Language-Audio Pretraining) is a neural network model that lea
 
 <img width="832" alt="clap_diagram_v3" src="https://user-images.githubusercontent.com/26778834/199842089-39ef6a2e-8abb-4338-bdfe-680abab70f53.png">
 
-## Citation
-https://arxiv.org/pdf/2206.04769.pdf
-```
-@article{elizalde2022clap,
-  title={Clap: Learning audio concepts from natural language supervision},
-  author={Elizalde, Benjamin and Deshmukh, Soham and Ismail, Mahmoud Al and Wang, Huaming},
-  journal={arXiv preprint arXiv:2206.04769},
-  year={2022}
-}
-```
-
 ## Setup
-- The setup assumes [Anaconda](https://www.anaconda.com) is installed
-- Open the anaconda terminal and follow the below commands. The symbol `{..}` indicates user input. 
+
+To get started, install the dependencies with Python 3: `pip install -r requirements.txt`.
+
+If you have [conda](https://www.anaconda.com) installed, you can instead create a dedicated environment and install the dependencies into it:
+
 ```shell
-> git clone https://github.com/microsoft/CLAP.git
-> cd CLAP
-> conda create -n clap python=3.8
-> conda activate clap
-> pip install -r requirements.txt
+git clone https://github.com/microsoft/CLAP.git && \
+cd CLAP && \
+conda create -n clap python=3.8 && \
+conda activate clap && \
+pip install -r requirements.txt
 ```
 
 ## CLAP weights
@@ -31,6 +23,9 @@ Request CLAP weights by filling this form: [link](https://forms.office.com/r/ULb
 
 
 ## Usage
+
+See `src/examples` for complete, runnable usage examples; note that the package now lives in `src/` rather than `CLAP_API/`.
+
 - Load model
 ```python
 from CLAP_API import CLAP 
@@ -53,93 +48,15 @@ audio_embeddings = clap_model.get_audio_embeddings(file_paths: List[str])
 sim = clap_model.compute_similarity(audio_embeddings, text_embeddings)
 ```
 
-### Zero-Shot inference on an audio file from [ESC50 dataset](https://github.com/karolpiczak/ESC-50)
-
-```python
-from CLAP_API import CLAP
-from esc50_dataset import ESC50
-import time
-import torch.nn.functional as F
-
-# Load CLAP
-weights_path = 'best.pth' # Add weight path here
-clap_model = CLAP(weights_path, use_cuda=False)
-
-# Load dataset
-dataset = ESC50(root='data', download=True) # set download=True when dataset is not downloaded
-audio_file, target, one_hot_target = dataset[1000]
-audio_file = [audio_file]
-prompt = 'this is a sound of '
-y = [prompt + x for x in dataset.classes]
-
-print('Computing text embeddings')
-text_embeddings = clap_model.get_text_embeddings(y)
-print('Computing audio embeddings')
-audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True)
-similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
-
-similarity = F.softmax(similarity, dim=1)
-values, indices = similarity[0].topk(5)
-# Print the result
-print("Ground Truth: {}".format(target))
-print("Top predictions:\n")
-for value, index in zip(values, indices):
-    print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
+## Citation
+https://arxiv.org/pdf/2206.04769.pdf
 ```
-
-The output (the exact numbers may vary):
-
-```
-Ground Truth: coughing
-Top predictions:
-
-        coughing: 86.34%
-        sneezing: 9.30%
-drinking sipping: 1.31%
-        laughing: 1.20%
-  glass breaking: 0.81%
-```
-
-### Zero-Shot Classification of [ESC50 dataset](https://github.com/karolpiczak/ESC-50) 
-
-```python
-from CLAP_API import CLAP
-from esc50_dataset import ESC50
-import torch.nn.functional as F
-import numpy as np
-from tqdm import tqdm
-from sklearn.metrics import accuracy_score
-
-# Load CLAP
-weights_path = # Add weight path here
-clap_model = CLAP(weights_path, use_cuda=False)
-
-# Load dataset
-dataset = ESC50(root='data', download=False)
-prompt = 'this is a sound of '
-Y = [prompt + x for x in dataset.classes]
-
-# Computing text embeddings
-text_embeddings = clap_model.get_text_embeddings(Y)
-
-# Computing audio embeddings
-y_preds, y_labels = [], []
-for i in tqdm(range(len(dataset))):
-    x, _, one_hot_target = dataset.__getitem__(i)
-    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
-    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
-    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
-    y_preds.append(y_pred)
-    y_labels.append(one_hot_target.detach().cpu().numpy())
-
-y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
-acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
-print('ESC50 Accuracy {}'.format(acc))
-```
-The output:
-
-```
-ESC50 Accuracy: 82.6%
+@article{elizalde2022clap,
+  title={Clap: Learning audio concepts from natural language supervision},
+  author={Elizalde, Benjamin and Deshmukh, Soham and Ismail, Mahmoud Al and Wang, Huaming},
+  journal={arXiv preprint arXiv:2206.04769},
+  year={2022}
+}
 ```
 
 ## Contributing
diff --git a/requirements.txt b/requirements.txt
index 033d3ff..00d0379 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -54,5 +54,4 @@ torchvision==0.9.1+cu111
 tqdm==4.60.0
 transformers==4.5.1
 typing-extensions==3.10.0.0
-urllib3==1.26.4
-wandb==0.10.28
\ No newline at end of file
+urllib3==1.26.4
\ No newline at end of file
diff --git a/CLAP_API/CLAPWrapper.py b/src/CLAPWrapper.py
similarity index 99%
rename from CLAP_API/CLAPWrapper.py
rename to src/CLAPWrapper.py
index dc76476..6900c3d 100644
--- a/CLAP_API/CLAPWrapper.py
+++ b/src/CLAPWrapper.py
@@ -12,7 +12,7 @@ import math
 import torchaudio.transforms as T
 import os
 import torch
-from importlib_resources import files, as_file
+from importlib_resources import files
 
 
 class CLAPWrapper():
diff --git a/CLAP_API/__init__.py b/src/__init__.py
similarity index 100%
rename from CLAP_API/__init__.py
rename to src/__init__.py
diff --git a/CLAP_API/configs/config.yml b/src/configs/config.yml
similarity index 100%
rename from CLAP_API/configs/config.yml
rename to src/configs/config.yml
diff --git a/esc50_dataset.py b/src/examples/esc50_dataset.py
similarity index 100%
rename from esc50_dataset.py
rename to src/examples/esc50_dataset.py
diff --git a/src/examples/zero_shot_classification.py b/src/examples/zero_shot_classification.py
new file mode 100644
index 0000000..130a1ef
--- /dev/null
+++ b/src/examples/zero_shot_classification.py
@@ -0,0 +1,46 @@
+"""
+Zero-shot classification of the ESC50 dataset with CLAP
+(https://github.com/karolpiczak/ESC-50).
+"""
+
+from src.CLAPWrapper import CLAPWrapper as CLAP
+from esc50_dataset import ESC50
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
+from sklearn.metrics import accuracy_score
+
+# Load the dataset (set download=True when it is not downloaded yet)
+dataset = ESC50(root='data', download=False)
+prompt = 'this is a sound of '
+y = [prompt + x for x in dataset.classes]
+
+
+# Load and initialize CLAP (use_cuda=True loads the model on a GPU)
+weights_path = '<insert your weights file path>'
+clap_model = CLAP(weights_path, use_cuda=False)
+
+
+# Compute the text embeddings of the class prompts
+text_embeddings = clap_model.get_text_embeddings(y)
+
+# Score every clip against all class prompts, one clip at a time
+y_preds, y_labels = [], []
+for i in tqdm(range(len(dataset))):
+    x, _, one_hot_target = dataset[i]
+    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
+    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
+    y_preds.append(y_pred)
+    y_labels.append(one_hot_target.detach().cpu().numpy())
+
+y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
+acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
+print('ESC50 Accuracy: {:.1%}'.format(acc))
+
+"""
+The output (the exact number may vary):
+
+ESC50 Accuracy: 82.6%
+
+"""
diff --git a/file_inference.py b/src/examples/zero_shot_predictions.py
similarity index 53%
rename from file_inference.py
rename to src/examples/zero_shot_predictions.py
index d1e5546..4333f51 100644
--- a/file_inference.py
+++ b/src/examples/zero_shot_predictions.py
@@ -1,29 +1,52 @@
-from CLAP_API import CLAP
+"""
+Zero-shot prediction for a single ESC50 audio clip with CLAP
+(https://github.com/karolpiczak/ESC-50).
+"""
+
+from src.CLAPWrapper import CLAPWrapper as CLAP
 from esc50_dataset import ESC50
-import time
 import torch.nn.functional as F
 
-# Load CLAP
-weights_path = 'C:\\Users\\sdeshmukh\\Desktop\\CLAP_package\\model\\new\\best.pth' # Add weight path here
-clap_model = CLAP(weights_path, use_cuda=False)
-
-# Load dataset
+# Load ESC50 dataset
 dataset = ESC50(root='data', download=True) # set download=True when dataset is not downloaded
 audio_file, target, one_hot_target = dataset[1000]
 audio_file = [audio_file]
 prompt = 'this is a sound of '
 y = [prompt + x for x in dataset.classes]
 
-print('Computing text embeddings')
+# Load and initialize CLAP
+weights_path = '<insert your weights file path>'
+
+# Setting use_cuda = True will load the model on a GPU using CUDA
+clap_model = CLAP(weights_path, use_cuda=False)
+
+# Compute the text embeddings of the class prompts
 text_embeddings = clap_model.get_text_embeddings(y)
-print('Computing audio embeddings')
+
+# Compute the audio embeddings of the selected audio file
 audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True)
+
+# compute the similarity between audio_embeddings and text_embeddings
 similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
 
 similarity = F.softmax(similarity, dim=1)
 values, indices = similarity[0].topk(5)
-# Print the result
+
+# view the results
 print("Ground Truth: {}".format(target))
 print("Top predictions:\n")
 for value, index in zip(values, indices):
-    print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
\ No newline at end of file
+    print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
+
+"""
+The output (the exact numbers may vary):
+
+Ground Truth: coughing
+Top predictions:
+
+        coughing: 86.34%
+        sneezing: 9.30%
+drinking sipping: 1.31%
+        laughing: 1.20%
+  glass breaking: 0.81%
+"""
\ No newline at end of file
diff --git a/CLAP_API/models/__init__.py b/src/models/__init__.py
similarity index 100%
rename from CLAP_API/models/__init__.py
rename to src/models/__init__.py
diff --git a/CLAP_API/models/audio.py b/src/models/audio.py
similarity index 100%
rename from CLAP_API/models/audio.py
rename to src/models/audio.py
diff --git a/CLAP_API/models/clap.py b/src/models/clap.py
similarity index 100%
rename from CLAP_API/models/clap.py
rename to src/models/clap.py
diff --git a/CLAP_API/models/utils.py b/src/models/utils.py
similarity index 100%
rename from CLAP_API/models/utils.py
rename to src/models/utils.py