diff --git a/CLAP_API/configs/__init__.py b/CLAP_API/configs/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/README.md b/README.md
index dbccf55..e1c84dd 100644
--- a/README.md
+++ b/README.md
@@ -4,26 +4,18 @@ CLAP (Contrastive Language-Audio Pretraining) is a neural network model that lea
-## Citation
-https://arxiv.org/pdf/2206.04769.pdf
-```
-@article{elizalde2022clap,
- title={Clap: Learning audio concepts from natural language supervision},
- author={Elizalde, Benjamin and Deshmukh, Soham and Ismail, Mahmoud Al and Wang, Huaming},
- journal={arXiv preprint arXiv:2206.04769},
- year={2022}
-}
-```
-
## Setup
-- The setup assumes [Anaconda](https://www.anaconda.com) is installed
-- Open the anaconda terminal and follow the below commands. The symbol `{..}` indicates user input.
+
+To get started, install the dependencies with Python 3: `pip install -r requirements.txt`.
+
+If you have [conda](https://www.anaconda.com) installed, you can run the following:
+
```shell
-> git clone https://github.com/microsoft/CLAP.git
-> cd CLAP
-> conda create -n clap python=3.8
-> conda activate clap
-> pip install -r requirements.txt
+git clone https://github.com/microsoft/CLAP.git && \
+cd CLAP && \
+conda create -n clap python=3.8 -y && \
+conda activate clap && \
+pip install -r requirements.txt
```
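+
+If you prefer not to use conda, a plain virtual environment should also work (a minimal sketch, assuming `python3` points to a Python 3.8 interpreter on your PATH):
+
+```shell
+git clone https://github.com/microsoft/CLAP.git && \
+cd CLAP && \
+python3 -m venv .venv && \
+source .venv/bin/activate && \
+pip install -r requirements.txt
+```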
## CLAP weights
@@ -31,6 +23,9 @@ Request CLAP weights by filling this form: [link](https://forms.office.com/r/ULb
## Usage
+
+Please take a look at `src/examples` for usage examples.
+
- Load model
```python
-from CLAP_API import CLAP
+from src import CLAP
@@ -53,93 +48,15 @@ audio_embeddings = clap_model.get_audio_embeddings(file_paths: List[str])
sim = clap_model.compute_similarity(audio_embeddings, text_embeddings)
```
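+
+The similarity matrix can then be turned into per-class probabilities and ranked. A minimal sketch, mirroring `src/examples/zero_shot_predictions.py` and assuming `sim` from the snippet above plus a list of class label strings `class_labels`:
+
+```python
+import torch.nn.functional as F
+
+# Softmax over the text axis gives a probability per class label
+probs = F.softmax(sim, dim=1)
+
+# Top-5 predictions for the first audio file
+# class_labels: the List[str] passed to get_text_embeddings above (assumed)
+values, indices = probs[0].topk(5)
+for value, index in zip(values, indices):
+    print(f"{class_labels[index]}: {100 * value.item():.2f}%")
+```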
-### Zero-Shot inference on an audio file from [ESC50 dataset](https://github.com/karolpiczak/ESC-50)
-
-```python
-from CLAP_API import CLAP
-from esc50_dataset import ESC50
-import time
-import torch.nn.functional as F
-
-# Load CLAP
-weights_path = 'best.pth' # Add weight path here
-clap_model = CLAP(weights_path, use_cuda=False)
-
-# Load dataset
-dataset = ESC50(root='data', download=True) # set download=True when dataset is not downloaded
-audio_file, target, one_hot_target = dataset[1000]
-audio_file = [audio_file]
-prompt = 'this is a sound of '
-y = [prompt + x for x in dataset.classes]
-
-print('Computing text embeddings')
-text_embeddings = clap_model.get_text_embeddings(y)
-print('Computing audio embeddings')
-audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True)
-similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
-
-similarity = F.softmax(similarity, dim=1)
-values, indices = similarity[0].topk(5)
-# Print the result
-print("Ground Truth: {}".format(target))
-print("Top predictions:\n")
-for value, index in zip(values, indices):
- print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
+## Citation
+https://arxiv.org/pdf/2206.04769.pdf
```
-
-The output (the exact numbers may vary):
-
-```
-Ground Truth: coughing
-Top predictions:
-
- coughing: 86.34%
- sneezing: 9.30%
-drinking sipping: 1.31%
- laughing: 1.20%
- glass breaking: 0.81%
-```
-
-### Zero-Shot Classification of [ESC50 dataset](https://github.com/karolpiczak/ESC-50)
-
-```python
-from CLAP_API import CLAP
-from esc50_dataset import ESC50
-import torch.nn.functional as F
-import numpy as np
-from tqdm import tqdm
-from sklearn.metrics import accuracy_score
-
-# Load CLAP
-weights_path = # Add weight path here
-clap_model = CLAP(weights_path, use_cuda=False)
-
-# Load dataset
-dataset = ESC50(root='data', download=False)
-prompt = 'this is a sound of '
-Y = [prompt + x for x in dataset.classes]
-
-# Computing text embeddings
-text_embeddings = clap_model.get_text_embeddings(Y)
-
-# Computing audio embeddings
-y_preds, y_labels = [], []
-for i in tqdm(range(len(dataset))):
- x, _, one_hot_target = dataset.__getitem__(i)
- audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
- similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
- y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
- y_preds.append(y_pred)
- y_labels.append(one_hot_target.detach().cpu().numpy())
-
-y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
-acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
-print('ESC50 Accuracy {}'.format(acc))
-```
-The output:
-
-```
-ESC50 Accuracy: 82.6%
+@article{elizalde2022clap,
+ title={{CLAP}: Learning audio concepts from natural language supervision},
+ author={Elizalde, Benjamin and Deshmukh, Soham and Ismail, Mahmoud Al and Wang, Huaming},
+ journal={arXiv preprint arXiv:2206.04769},
+ year={2022}
+}
```
## Contributing
diff --git a/requirements.txt b/requirements.txt
index 033d3ff..00d0379 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -54,5 +54,4 @@ torchvision==0.9.1+cu111
tqdm==4.60.0
transformers==4.5.1
typing-extensions==3.10.0.0
-urllib3==1.26.4
-wandb==0.10.28
\ No newline at end of file
+urllib3==1.26.4
\ No newline at end of file
diff --git a/CLAP_API/CLAPWrapper.py b/src/CLAPWrapper.py
similarity index 99%
rename from CLAP_API/CLAPWrapper.py
rename to src/CLAPWrapper.py
index dc76476..6900c3d 100644
--- a/CLAP_API/CLAPWrapper.py
+++ b/src/CLAPWrapper.py
@@ -12,7 +12,7 @@ import math
import torchaudio.transforms as T
import os
import torch
-from importlib_resources import files, as_file
+from importlib_resources import files
class CLAPWrapper():
diff --git a/CLAP_API/__init__.py b/src/__init__.py
similarity index 100%
rename from CLAP_API/__init__.py
rename to src/__init__.py
diff --git a/CLAP_API/configs/config.yml b/src/configs/config.yml
similarity index 100%
rename from CLAP_API/configs/config.yml
rename to src/configs/config.yml
diff --git a/esc50_dataset.py b/src/examples/esc50_dataset.py
similarity index 100%
rename from esc50_dataset.py
rename to src/examples/esc50_dataset.py
diff --git a/src/examples/zero_shot_classification.py b/src/examples/zero_shot_classification.py
new file mode 100644
index 0000000..130a1ef
--- /dev/null
+++ b/src/examples/zero_shot_classification.py
@@ -0,0 +1,46 @@
+"""
+This is an example using CLAP to perform zeroshot
+ classification on ESC50 (https://github.com/karolpiczak/ESC-50).
+"""
+
+from src.CLAPWrapper import CLAPWrapper as CLAP
+from esc50_dataset import ESC50
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
+from sklearn.metrics import accuracy_score
+
+# Load dataset
+dataset = ESC50(root='data', download=False) # set download=True when dataset is not downloaded
+prompt = 'this is a sound of '
+y = [prompt + x for x in dataset.classes]
+
+
+# Load and initialize CLAP
+weights_path = ''  # Add weight path here
+clap_model = CLAP(weights_path, use_cuda=False)
+
+
+# Computing text embeddings
+text_embeddings = clap_model.get_text_embeddings(y)
+
+# Computing audio embeddings
+y_preds, y_labels = [], []
+for i in tqdm(range(len(dataset))):
+ x, _, one_hot_target = dataset.__getitem__(i)
+ audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
+ similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+ y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
+ y_preds.append(y_pred)
+ y_labels.append(one_hot_target.detach().cpu().numpy())
+
+y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
+acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
+print('ESC50 Accuracy: {:.1f}%'.format(acc * 100))
+
+"""
+The output:
+
+ESC50 Accuracy: 82.6%
+
+"""
diff --git a/file_inference.py b/src/examples/zero_shot_predictions.py
similarity index 53%
rename from file_inference.py
rename to src/examples/zero_shot_predictions.py
index d1e5546..4333f51 100644
--- a/file_inference.py
+++ b/src/examples/zero_shot_predictions.py
@@ -1,29 +1,52 @@
-from CLAP_API import CLAP
+"""
+This is an example using CLAP for zero-shot
+ inference using ESC50 (https://github.com/karolpiczak/ESC-50).
+"""
+
+from src.CLAPWrapper import CLAPWrapper as CLAP
from esc50_dataset import ESC50
-import time
import torch.nn.functional as F
-# Load CLAP
-weights_path = 'C:\\Users\\sdeshmukh\\Desktop\\CLAP_package\\model\\new\\best.pth' # Add weight path here
-clap_model = CLAP(weights_path, use_cuda=False)
-
-# Load dataset
+# Load ESC50 dataset
dataset = ESC50(root='data', download=True) # set download=True when dataset is not downloaded
audio_file, target, one_hot_target = dataset[1000]
audio_file = [audio_file]
prompt = 'this is a sound of '
y = [prompt + x for x in dataset.classes]
-print('Computing text embeddings')
+# Load and initialize CLAP
+weights_path = ''  # Add weight path here
+
+# Setting use_cuda = True will load the model on a GPU using CUDA
+clap_model = CLAP(weights_path, use_cuda=False)
+
+# compute text embeddings from the natural-language prompts
text_embeddings = clap_model.get_text_embeddings(y)
-print('Computing audio embeddings')
+
+# compute the audio embeddings from an audio file
audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True)
+
+# compute the similarity between audio_embeddings and text_embeddings
similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
similarity = F.softmax(similarity, dim=1)
values, indices = similarity[0].topk(5)
-# Print the result
+
+# view the results
print("Ground Truth: {}".format(target))
print("Top predictions:\n")
for value, index in zip(values, indices):
- print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
\ No newline at end of file
+ print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
+
+"""
+The output (the exact numbers may vary):
+
+Ground Truth: coughing
+Top predictions:
+
+ coughing: 86.34%
+ sneezing: 9.30%
+drinking sipping: 1.31%
+ laughing: 1.20%
+ glass breaking: 0.81%
+"""
\ No newline at end of file
diff --git a/CLAP_API/models/__init__.py b/src/models/__init__.py
similarity index 100%
rename from CLAP_API/models/__init__.py
rename to src/models/__init__.py
diff --git a/CLAP_API/models/audio.py b/src/models/audio.py
similarity index 100%
rename from CLAP_API/models/audio.py
rename to src/models/audio.py
diff --git a/CLAP_API/models/clap.py b/src/models/clap.py
similarity index 100%
rename from CLAP_API/models/clap.py
rename to src/models/clap.py
diff --git a/CLAP_API/models/utils.py b/src/models/utils.py
similarity index 100%
rename from CLAP_API/models/utils.py
rename to src/models/utils.py