[M] Move examples to /examples

2023-10-11 19:26:05 -04:00
parent 3788d4e225
commit b41935ff3c
4 changed files with 0 additions and 0 deletions
@@ -0,0 +1,25 @@
+"""
+Example: audio captioning with CLAPCAP.
+
+Builds a CLAPWrapper with version='clapcap' and use_cuda=False, then prints
+one generated caption for every path listed in `audio_files`.
+"""
+from CLAPWrapper import CLAPWrapper
+
+# Load and initialize CLAP ("weights_path" is a placeholder for the checkpoint path)
+weights_path = "weights_path"
+clap_model = CLAPWrapper(weights_path, version='clapcap', use_cuda=False)
+
+# Audio files to caption ('audio_file' is a placeholder for a real path)
+audio_files = ['audio_file']
+
+# Generate captions for the recordings
+captions = clap_model.generate_caption(
+    audio_files,
+    resample=True,
+    beam_size=5,
+    entry_length=67,
+    temperature=0.01,
+)
+
+# Print the result. Pairing the two lists directly avoids index bookkeeping
+# (assumes generate_caption returns one caption per input file, in order -
+# TODO confirm against CLAPWrapper).
+for audio_file, caption in zip(audio_files, captions):
+    print(f"Audio file: {audio_file} \n")
+    print(f"Generated caption: {caption} \n")
+
+"""
+The output (the exact caption may vary):
+
+The birds are singing in the trees.
+"""
@@ -0,0 +1,82 @@
+from torch.utils.data import Dataset
+from torchvision.datasets.utils import download_url
+from tqdm import tqdm
+import pandas as pd
+import os
+import torch.nn as nn
+import torch
+
+class AudioDataset(Dataset):
+    """Base class for audio datasets that live under a root directory.
+
+    The `download`, `__len__` and `__getitem__` methods defined here only
+    raise NotImplementedError; subclasses are expected to override them.
+    """
+
+    def __init__(self, root: str, download: bool = True):
+        """Store the user-expanded `root` and call `self.download()` when `download` is true."""
+        expanded_root = os.path.expanduser(root)
+        self.root = expanded_root
+        if not download:
+            return
+        self.download()
+
+    def download(self):
+        """Fetch the dataset; must be provided by the subclass."""
+        raise NotImplementedError
+
+    def __len__(self):
+        """Number of items; must be provided by the subclass."""
+        raise NotImplementedError
+
+    def __getitem__(self, index):
+        """Item at `index`; must be provided by the subclass."""
+        raise NotImplementedError
+
+
+class ESC50(AudioDataset):
+    """ESC-50 dataset indexed from its metadata CSV.
+
+    Each item is `(file_path, target, one_hot_target)`. Audio is not read
+    by this class; only the path to each clip is returned.
+    """
+
+    base_folder = 'ESC-50-master'  # folder under `root` that holds the extracted dataset
+    url = "https://github.com/karoldvl/ESC-50/archive/master.zip"
+    filename = "ESC-50-master.zip"  # name the archive is saved under inside `root`
+    num_files_in_dir = 2000  # not referenced by the methods of this class
+    audio_dir = 'audio'
+    label_col = 'category'  # metadata column holding the class name
+    file_col = 'filename'  # metadata column holding the audio file name
+    meta = {
+        'filename': os.path.join('meta','esc50.csv'),
+    }
+
+    def __init__(self, root, reading_transformations: nn.Module = None, download: bool = True):
+        """Optionally download the dataset, then index every clip listed in the metadata.
+
+        Args:
+            root: Directory that contains, or will receive, the dataset.
+            reading_transformations: Stored as `self.pre_transformations`;
+                it is not applied anywhere in this class.
+            download: If true, fetch and extract the archive into `root`
+                before the metadata is read.
+        """
+        # Forward `download` explicitly; otherwise the base-class default (True)
+        # would trigger a download even when the caller passed download=False.
+        super().__init__(root, download)
+        self._load_meta()
+
+        self.targets, self.audio_paths = [], []
+        self.pre_transformations = reading_transformations
+        print("Loading audio files")
+        # Labels use spaces instead of underscores so they match `self.classes`.
+        self.df[self.label_col] = self.df[self.label_col].str.replace('_', ' ')
+
+        audio_root = os.path.join(self.root, self.base_folder, self.audio_dir)
+        for _, row in tqdm(self.df.iterrows(), total=len(self.df)):
+            self.targets.append(row[self.label_col])
+            self.audio_paths.append(os.path.join(audio_root, row[self.file_col]))
+
+    def _load_meta(self):
+        """Read the metadata CSV into `self.df` and build `self.classes` and `self.class_to_idx`."""
+        path = os.path.join(self.root, self.base_folder, self.meta['filename'])
+
+        self.df = pd.read_csv(path)
+        # Sort on the raw names first, then swap underscores for spaces.
+        self.classes = [x.replace('_',' ') for x in sorted(self.df[self.label_col].unique())]
+        self.class_to_idx = {category: i for i, category in enumerate(self.classes)}
+
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: (file_path, target, one_hot_target) where file_path is the path
+                to the audio file, target is the class name and one_hot_target is a
+                tensor of shape (1, number of classes) with a 1 at the class index.
+        """
+        file_path, target = self.audio_paths[index], self.targets[index]
+        idx = torch.tensor(self.class_to_idx[target])
+        one_hot_target = torch.zeros(len(self.classes)).scatter_(0, idx, 1).reshape(1,-1)
+        return file_path, target, one_hot_target
+
+    def __len__(self):
+        """Return the number of indexed audio files."""
+        return len(self.audio_paths)
+
+    def download(self):
+        """Download the archive from `self.url` into `self.root` and extract it there."""
+        download_url(self.url, self.root, self.filename)
+
+        # extract file
+        from zipfile import ZipFile
+        # `archive` rather than `zip`, so the builtin is not shadowed
+        with ZipFile(os.path.join(self.root, self.filename), 'r') as archive:
+            archive.extractall(path=self.root)
@@ -0,0 +1,46 @@
+"""
+This is an example using CLAP to perform zeroshot
+    classification on ESC50 (https://github.com/karolpiczak/ESC-50).
+"""
+
+from CLAPWrapper import CLAPWrapper
+from esc50_dataset import ESC50
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
+from sklearn.metrics import accuracy_score
+
+# Load dataset
+root_path = "root_path" # Folder with ESC-50-master/
+dataset = ESC50(root=root_path, download=True) #If download=False code assumes base_folder='ESC-50-master' in esc50_dataset.py
+prompt = 'this is the sound of '
+# One text prompt per class, in the same order as dataset.classes
+y = [prompt + x for x in dataset.classes]
+
+# Load and initialize CLAP
+weights_path = "weights_path"
+clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
+
+# Computing text embeddings
+text_embeddings = clap_model.get_text_embeddings(y)
+
+# Computing audio embeddings and class probabilities, one clip at a time
+y_preds, y_labels = [], []
+for i in tqdm(range(len(dataset))):
+    # Index the dataset normally instead of calling the dunder method directly
+    x, _, one_hot_target = dataset[i]
+    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
+    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
+    y_preds.append(y_pred)
+    y_labels.append(one_hot_target.detach().cpu().numpy())
+
+
+# Stack the per-clip rows, then compare the arg-max class of labels and predictions
+y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
+acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
+print('ESC50 Accuracy {}'.format(acc))
+
+"""
+The output (accuracy is printed as a fraction; the exact value may vary):
+
+ESC50 Accuracy 0.939
+
+"""
@@ -0,0 +1,51 @@
+"""
+This is an example using CLAP for zero-shot inference.
+"""
+from CLAPWrapper import CLAPWrapper
+import torch.nn.functional as F
+
+# Define classes for zero-shot
+# Should be in lower case and can be more than one word
+classes = ['coughing','sneezing','drinking sipping', 'breathing', 'brushing teeth']
+ground_truth = ['coughing']
+# Add prompt
+prompt = 'this is a sound of '
+class_prompts = [prompt + x for x in classes]
+#Load audio files
+audio_files = ['audio_file']
+
+# Load and initialize CLAP
+weights_path = "weights_path"
+# Setting use_cuda = True will load the model on a GPU using CUDA
+clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
+
+# compute text embeddings from natural text
+text_embeddings = clap_model.get_text_embeddings(class_prompts)
+
+# compute the audio embeddings from an audio file
+audio_embeddings = clap_model.get_audio_embeddings(audio_files, resample=True)
+
+# compute the similarity between audio_embeddings and text_embeddings
+similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+
+# Turn the similarity scores into a probability distribution over the classes
+similarity = F.softmax(similarity, dim=1)
+# Never request more entries than there are classes; topk raises when k is too large
+top_k = min(5, len(classes))
+# Only the first audio file's row is ranked
+values, indices = similarity[0].topk(top_k)
+
+# Print the results
+print("Ground Truth: {}".format(ground_truth))
+print("Top predictions:\n")
+for value, index in zip(values, indices):
+    print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")
+
+"""
+The output (the exact numbers may vary):
+
+Ground Truth: ['coughing']
+Top predictions:
+
+        coughing: 98.55%
+        sneezing: 1.24%
+drinking sipping: 0.15%
+       breathing: 0.02%
+  brushing teeth: 0.01%
+"""