Merge pull request #25 from hykilpikonna/main

Fix Python 3.11 compatibility & improve ease of use

Benjamin Elizalde committed 2023-10-12 13:15:08 -07:00 (committed via GitHub)

24 changed files with 1817 additions and 93 deletions
.gitignore (+3)
@@ -348,3 +348,6 @@ MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
+dist/
+.DS_Store
+._*
README.md (+12 -14)
@@ -4,20 +4,18 @@
CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks achieving SoTA in several of them including classification, retrieval, and captioning.
-<img width="832" alt="clap_diagrams" src="https://github.com/bmartin1/CLAP/assets/26778834/c5340a09-cc0c-4e41-ad5a-61546eaa824c">
+<img width="832" alt="clap_diagrams" src="https://raw.githubusercontent.com/hykilpikonna/CLAP/main/docs/diagram.png">
## Setup
-Install the dependencies: `pip install -r requirements.txt` using Python 3 to get started.
-If you have [conda](https://www.anaconda.com) installed, you can run the following:
+First, install Python 3.8 or higher (3.11 recommended). Then install CLAP using either of the following:
```shell
-git clone https://github.com/microsoft/CLAP.git && \
-cd CLAP && \
-conda create -n clap python=3.10 && \
-conda activate clap && \
-pip install -r requirements.txt
+# Install the PyPI package
+pip install msclap
+# Or install the latest (unstable) git source
+pip install git+https://github.com/microsoft/CLAP.git
```
## NEW CLAP weights
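A quick way to sanity-check the new install path is to import the package and confirm the re-exported entry point exists. A minimal sketch, assuming `pip install msclap` from this PR succeeded:

```python
# Minimal install check; assumes the msclap package from this PR is installed.
import msclap

# __init__.py (added later in this diff) re-exports CLAPWrapper as CLAP.
print(msclap.CLAP)
```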
@@ -31,9 +29,9 @@ In `CLAP\src\`:
- Zero-Shot Classification and Retrieval
```python
-# Load model (Choose between versions '2022' or '2023')
-from CLAPWrapper import CLAPWrapper as CLAP
+from msclap import CLAP
+# Load model (Choose between versions '2022' or '2023')
clap_model = CLAP("<PATH TO WEIGHTS>", version = '2023', use_cuda=False)
# Extract text embeddings
@@ -48,9 +46,9 @@ similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)
- Audio Captioning
```python
-# Load model (Choose version 'clapcap')
-from CLAPWrapper import CLAPWrapper as CLAP
+from msclap import CLAP
+# Load model (Choose version 'clapcap')
clap_model = CLAP("<PATH TO WEIGHTS>", version = 'clapcap', use_cuda=False)
# Generate audio captions
@@ -58,7 +56,7 @@ captions = clap_model.generate_caption(file_paths: List[str])
```
## Examples
-Take a look at `CLAP\src\` for usage examples.
+Take a look at [examples](./examples/) for usage examples.
To run Zero-Shot Classification on the ESC50 dataset try the following:
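The ESC50 command itself is truncated in this diff. As a rough sketch of what zero-shot scoring looks like with the new import path (the audio path and class labels below are placeholders, and `get_audio_embeddings` is assumed from the surrounding README context):

```python
from msclap import CLAP

# Placeholder weights path and labels; the real script lives in examples/.
clap_model = CLAP("<PATH TO WEIGHTS>", version='2023', use_cuda=False)

class_prompts = ["this is a sound of " + c for c in ["dog", "rain", "sea waves"]]
text_embeddings = clap_model.get_text_embeddings(class_prompts)
audio_embeddings = clap_model.get_audio_embeddings(["audio.wav"])

# Higher similarity means the audio is closer to that class prompt.
similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)
```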
docs/diagram.png (BIN, new image, 81 KiB; binary file not shown)
@@ -1,11 +1,11 @@
"""
This is an example using CLAPCAP for audio captioning.
"""
-from CLAPWrapper import CLAPWrapper
+from msclap import CLAP
# Load and initialize CLAP
weights_path = "weights_path"
-clap_model = CLAPWrapper(weights_path, version = 'clapcap', use_cuda=False)
+clap_model = CLAP(weights_path, version = 'clapcap', use_cuda=False)
#Load audio files
audio_files = ['audio_file']
@@ -3,7 +3,7 @@ This is an example using CLAP to perform zeroshot
classification on ESC50 (https://github.com/karolpiczak/ESC-50).
"""
-from CLAPWrapper import CLAPWrapper
+from msclap import CLAP
from esc50_dataset import ESC50
import torch.nn.functional as F
import numpy as np
@@ -18,7 +18,7 @@ y = [prompt + x for x in dataset.classes]
# Load and initialize CLAP
weights_path = "weights_path"
-clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
+clap_model = CLAP(weights_path, version = '2023', use_cuda=False)
# Computing text embeddings
text_embeddings = clap_model.get_text_embeddings(y)
@@ -1,7 +1,7 @@
"""
This is an example using CLAP for zero-shot inference.
"""
-from CLAPWrapper import CLAPWrapper
+from msclap import CLAP
import torch.nn.functional as F
# Define classes for zero-shot
@@ -17,7 +17,7 @@ audio_files = ['audio_file']
# Load and initialize CLAP
weights_path = "weights_path"
# Setting use_cuda = True will load the model on a GPU using CUDA
-clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
+clap_model = CLAP(weights_path, version = '2023', use_cuda=False)
# compute text embeddings from natural text
text_embeddings = clap_model.get_text_embeddings(class_prompts)
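All of the example changes are the same mechanical rename: the old in-repo `CLAPWrapper` import becomes the packaged `msclap` one. Downstream code that must run against both layouts could use a fallback; an illustrative shim, not part of the PR:

```python
# Illustrative compatibility shim, not from this PR: prefer the packaged
# import, fall back to the old in-repo module for a pre-package checkout.
try:
    from msclap import CLAP
except ImportError:
    from CLAPWrapper import CLAPWrapper as CLAP

weights_path = "weights_path"
clap_model = CLAP(weights_path, version='2023', use_cuda=False)
```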
msclap/CLAPWrapper.py (+6 -7)
@@ -1,19 +1,18 @@
+from pathlib import Path
import warnings
warnings.filterwarnings("ignore")
import random
import torchaudio
-from torch._six import string_classes
import collections
import re
import numpy as np
from transformers import AutoTokenizer, logging
-from models.clap import CLAP
-from models.mapper import get_clapcap
+from .models.clap import CLAP
+from .models.mapper import get_clapcap
import math
import torchaudio.transforms as T
import os
import torch
-from importlib_resources import files
import argparse
import yaml
import sys
@@ -42,7 +41,7 @@ class CLAPWrapper():
def get_config_path(self, version):
if version in self.supported_versions:
-return files('configs').joinpath(f"config_{version}.yml").read_text()
+return (Path(__file__).parent / f"configs/config_{version}.yml").read_text()
else:
raise ValueError(f"The specific version is not supported. The supported versions are {str(self.supported_versions)}")
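The old lookup via `importlib_resources.files('configs')` treated `configs` as a top-level package, which only resolves in a source checkout with the right `sys.path`; resolving against `__file__` works both there and from an installed wheel. The pattern in isolation (a sketch; the function name is hypothetical):

```python
from pathlib import Path

def read_packaged_config(version: str) -> str:
    # Resolve configs/config_<version>.yml relative to this module's own
    # directory, independent of the caller's working directory or sys.path.
    return (Path(__file__).parent / f"configs/config_{version}.yml").read_text()
```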
@@ -99,7 +98,7 @@ class CLAPWrapper():
# We unwrap the DDP model and save. If the model is not unwrapped and saved, then the model needs to unwrapped before `load_state_dict`:
# Reference link: https://discuss.pytorch.org/t/how-to-load-dataparallel-model-which-trained-using-multiple-gpus/146005
-clap.load_state_dict(model_state_dict)
+clap.load_state_dict(model_state_dict, strict=False)
clap.eval() # set clap in eval mode
tokenizer = AutoTokenizer.from_pretrained(args.text_model)
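`strict=False` lets loading succeed when checkpoint and model keys do not match exactly (for example, training-only heads absent from the inference model). PyTorch returns the mismatches, so they can be surfaced rather than silently discarded; a small sketch using the names from the diff above:

```python
# Sketch: log what strict=False tolerated instead of ignoring it.
result = clap.load_state_dict(model_state_dict, strict=False)
if result.missing_keys or result.unexpected_keys:
    print("missing keys:", result.missing_keys)
    print("unexpected keys:", result.unexpected_keys)
```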
@@ -184,7 +183,7 @@ class CLAPWrapper():
return torch.tensor(batch, dtype=torch.float64)
elif isinstance(elem, int):
return torch.tensor(batch)
-elif isinstance(elem, string_classes):
+elif isinstance(elem, str):
return batch
elif isinstance(elem, collections.abc.Mapping):
return {key: self.default_collate([d[key] for d in batch]) for key in elem}
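`torch._six` was a private Python 2/3 compatibility shim that PyTorch has since removed (it is gone in the 2.x line this PR targets), and on Python 3 `string_classes` was simply `str`, so the plain `isinstance` check is a drop-in replacement. Code that must still span both PyTorch generations could do, illustratively:

```python
# Illustrative only: tolerate both pre- and post-removal PyTorch.
try:
    from torch._six import string_classes  # older PyTorch
except ImportError:
    string_classes = str  # the shim resolved to str on Python 3

assert isinstance("a label", string_classes)
```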
msclap/__init__.py (+1)
@@ -0,0 +1 @@
+from .CLAPWrapper import CLAPWrapper as CLAP
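This one-line `__init__.py` defines the package's public entry point: it aliases `CLAPWrapper` to the shorter `CLAP` name that the updated README and examples import. After installation, both spellings refer to the same class:

```python
from msclap import CLAP
from msclap.CLAPWrapper import CLAPWrapper

# The top-level name is just an alias for the wrapper class.
assert CLAP is CLAPWrapper
```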
@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
-from models.htsat import HTSATWrapper
+from .htsat import HTSATWrapper
def get_audio_encoder(name: str):
if name == "Cnn14":
msclap/models/htsat.py (+3 -11)
@@ -6,11 +6,8 @@
# Swin Transformer for Computer Vision: https://arxiv.org/pdf/2103.14030.pdf
import logging
-import pdb
import math
-import random
-from numpy.core.fromnumeric import clip, reshape
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
@@ -19,15 +16,10 @@ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
from itertools import repeat
from typing import List
-try:
-from models.pytorch_utils import do_mixup, interpolate
-import models.config as config
-except:
-from CLAP_API.models.pytorch_utils import do_mixup, interpolate
-from CLAP_API.models import config
+import torch.nn.functional as F
+from .pytorch_utils import do_mixup, interpolate
+from . import config
import collections.abc
import warnings
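The removed try/except guessed at the package's location with two absolute import paths behind a bare `except`, which breaks once the code ships as an installed package. Explicit relative imports resolve against the module's own package (`msclap.models` in this layout) regardless of the caller's working directory or `sys.path`. The before/after, as it reads inside the module:

```python
# Before: cwd-dependent absolute imports behind a bare except.
#   try:
#       from models.pytorch_utils import do_mixup, interpolate
#       import models.config as config
#   except ImportError:
#       from CLAP_API.models.pytorch_utils import do_mixup, interpolate
#       from CLAP_API.models import config

# After: relative imports, valid from any entry point once installed.
from .pytorch_utils import do_mixup, interpolate
from . import config
```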
@@ -2,10 +2,9 @@
import torch
import torch.nn as nn
from torch.nn import functional as nnf
-from torch.utils.data import Dataset, DataLoader
from enum import Enum
from transformers import GPT2LMHeadModel
-from typing import Tuple, Optional, Union
+from typing import Tuple, Optional
def get_clapcap(name: str):
if name == "ClapCaption":
@@ -1,5 +1,3 @@
-import numpy as np
-import time
import torch
import torch.nn as nn
poetry.lock (generated, +1754)
File diff suppressed because it is too large.
pyproject.toml (+30)
@@ -0,0 +1,30 @@
[tool.poetry]
name = "msclap"
version = "1.3.2"
description = "CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks achieving SoTA in several of them including classification, retrieval, and captioning."
authors = ["Benjamin Elizalde and Soham Deshmukh and Huaming Wang"]
license = "MIT"
readme = "README.md"
packages = [
{ include = "msclap" },
]

[tool.poetry.dependencies]
python = "^3.8"
librosa = "^0.10.1"
numpy = "^1.23.0"
numba = "^0.58.0"
pandas = "^2.0.0"
torch = "^2.1.0"
torchaudio = "^2.1.0"
torchlibrosa = "^0.1.0"
torchvision = "^0.16.0"
tqdm = "^4.66.1"
transformers = "^4.34.0"
pyyaml = "^6.0.1"
scikit-learn = "^1.3.1"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
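Poetry's caret constraints allow compatible upgrades instead of exact pins: `^2.1.0` means `>=2.1.0,<3.0.0`, and `^0.1.0` means `>=0.1.0,<0.2.0`. To see which versions were actually resolved at install time, the standard library suffices; a small sketch:

```python
# Sketch: print the installed versions of a few key distributions.
from importlib.metadata import version

for dist in ("msclap", "torch", "transformers"):
    print(dist, version(dist))
```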
requirements.txt (-50, deleted)
@@ -1,50 +0,0 @@
appdirs==1.4.4
audioread==3.0.0
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.0.1
colorama==0.4.6
decorator==5.1.1
filelock==3.9.0
flit_core==3.6.0
huggingface-hub==0.12.1
idna==3.4
importlib-metadata==6.0.0
importlib-resources==5.12.0
jaraco.classes==3.2.3
joblib==1.2.0
lazy_loader==0.1
librosa==0.10.0
llvmlite==0.39.1
mkl-service==2.4.0
more-itertools==9.0.0
msgpack==1.0.4
numba==0.56.4
numpy==1.23.5
packaging==23.0
pandas==1.4.2
pooch==1.6.0
pycparser==2.21
pywin32-ctypes==0.2.0
PyYAML==6.0
regex==2022.10.31
requests==2.28.2
scikit-learn==1.2.1
scipy==1.10.1
setuptools==65.6.3
six==1.16.0
soundfile==0.12.1
soxr==0.3.3
threadpoolctl==3.1.0
tokenizers==0.13.2
torch==1.13.1
torchaudio==0.13.1
torchlibrosa==0.1.0
torchvision==0.14.1
tqdm==4.64.1
transformers==4.26.1
typing_extensions==4.4.0
urllib3==1.26.14
wheel==0.38.4
wincertstore==0.2
zipp==3.14.0