Merge pull request #25 from hykilpikonna/main
[O] Fix python 3.11 compatibility & optimize ease-of-use
This commit is contained in:
@@ -348,3 +348,6 @@ MigrationBackup/
|
||||
|
||||
# Ionide (cross platform F# VS Code tools) working folder
|
||||
.ionide/
|
||||
dist/
|
||||
.DS_Store
|
||||
._*
|
||||
@@ -4,20 +4,18 @@
|
||||
|
||||
CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks achieving SoTA in several of them including classification, retrieval, and captioning.
|
||||
|
||||
<img width="832" alt="clap_diagrams" src="https://github.com/bmartin1/CLAP/assets/26778834/c5340a09-cc0c-4e41-ad5a-61546eaa824c">
|
||||
<img width="832" alt="clap_diagrams" src="https://raw.githubusercontent.com/hykilpikonna/CLAP/main/docs/diagram.png">
|
||||
|
||||
## Setup
|
||||
|
||||
Install the dependencies: `pip install -r requirements.txt` using Python 3 to get started.
|
||||
|
||||
If you have [conda](https://www.anaconda.com) installed, you can run the following:
|
||||
First, install Python 3.8 or higher (3.11 recommended). Then, install CLAP using either of the following:
|
||||
|
||||
```shell
|
||||
git clone https://github.com/microsoft/CLAP.git && \
|
||||
cd CLAP && \
|
||||
conda create -n clap python=3.10 && \
|
||||
conda activate clap && \
|
||||
pip install -r requirements.txt
|
||||
# Install PyPI package
|
||||
pip install msclap
|
||||
|
||||
# Or install the latest (unstable) version from the git source
|
||||
pip install git+https://github.com/microsoft/CLAP.git
|
||||
```
|
||||
|
||||
## NEW CLAP weights
|
||||
@@ -31,9 +29,9 @@ In `CLAP\src\`:
|
||||
|
||||
- Zero-Shot Classification and Retrieval
|
||||
```python
|
||||
# Load model (Choose between versions '2022' or '2023')
|
||||
from CLAPWrapper import CLAPWrapper as CLAP
|
||||
from msclap import CLAP
|
||||
|
||||
# Load model (Choose between versions '2022' or '2023')
|
||||
clap_model = CLAP("<PATH TO WEIGHTS>", version = '2023', use_cuda=False)
|
||||
|
||||
# Extract text embeddings
|
||||
@@ -48,9 +46,9 @@ similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)
|
||||
|
||||
- Audio Captioning
|
||||
```python
|
||||
# Load model (Choose version 'clapcap')
|
||||
from CLAPWrapper import CLAPWrapper as CLAP
|
||||
from msclap import CLAP
|
||||
|
||||
# Load model (Choose version 'clapcap')
|
||||
clap_model = CLAP("<PATH TO WEIGHTS>", version = 'clapcap', use_cuda=False)
|
||||
|
||||
# Generate audio captions
|
||||
@@ -58,7 +56,7 @@ captions = clap_model.generate_caption(file_paths: List[str])
|
||||
```
|
||||
|
||||
## Examples
|
||||
Take a look at `CLAP\src\` for usage examples.
|
||||
Take a look at [examples](./examples/) for usage examples.
|
||||
|
||||
To run Zero-Shot Classification on the ESC50 dataset try the following:
|
||||
|
||||
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 81 KiB |
@@ -1,11 +1,11 @@
|
||||
"""
|
||||
This is an example using CLAPCAP for audio captioning.
|
||||
"""
|
||||
from CLAPWrapper import CLAPWrapper
|
||||
from msclap import CLAP
|
||||
|
||||
# Load and initialize CLAP
|
||||
weights_path = "weights_path"
|
||||
clap_model = CLAPWrapper(weights_path, version = 'clapcap', use_cuda=False)
|
||||
clap_model = CLAP(weights_path, version = 'clapcap', use_cuda=False)
|
||||
|
||||
#Load audio files
|
||||
audio_files = ['audio_file']
|
||||
@@ -3,7 +3,7 @@ This is an example using CLAP to perform zeroshot
|
||||
classification on ESC50 (https://github.com/karolpiczak/ESC-50).
|
||||
"""
|
||||
|
||||
from CLAPWrapper import CLAPWrapper
|
||||
from msclap import CLAP
|
||||
from esc50_dataset import ESC50
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
@@ -18,7 +18,7 @@ y = [prompt + x for x in dataset.classes]
|
||||
|
||||
# Load and initialize CLAP
|
||||
weights_path = "weights_path"
|
||||
clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
|
||||
clap_model = CLAP(weights_path, version = '2023', use_cuda=False)
|
||||
|
||||
# Computing text embeddings
|
||||
text_embeddings = clap_model.get_text_embeddings(y)
|
||||
@@ -1,7 +1,7 @@
|
||||
"""
|
||||
This is an example using CLAP for zero-shot inference.
|
||||
"""
|
||||
from CLAPWrapper import CLAPWrapper
|
||||
from msclap import CLAP
|
||||
import torch.nn.functional as F
|
||||
|
||||
# Define classes for zero-shot
|
||||
@@ -17,7 +17,7 @@ audio_files = ['audio_file']
|
||||
# Load and initialize CLAP
|
||||
weights_path = "weights_path"
|
||||
# Setting use_cuda = True will load the model on a GPU using CUDA
|
||||
clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
|
||||
clap_model = CLAP(weights_path, version = '2023', use_cuda=False)
|
||||
|
||||
# compute text embeddings from natural text
|
||||
text_embeddings = clap_model.get_text_embeddings(class_prompts)
|
||||
@@ -1,19 +1,18 @@
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
import random
|
||||
import torchaudio
|
||||
from torch._six import string_classes
|
||||
import collections
|
||||
import re
|
||||
import numpy as np
|
||||
from transformers import AutoTokenizer, logging
|
||||
from models.clap import CLAP
|
||||
from models.mapper import get_clapcap
|
||||
from .models.clap import CLAP
|
||||
from .models.mapper import get_clapcap
|
||||
import math
|
||||
import torchaudio.transforms as T
|
||||
import os
|
||||
import torch
|
||||
from importlib_resources import files
|
||||
import argparse
|
||||
import yaml
|
||||
import sys
|
||||
@@ -42,7 +41,7 @@ class CLAPWrapper():
|
||||
|
||||
def get_config_path(self, version):
|
||||
if version in self.supported_versions:
|
||||
return files('configs').joinpath(f"config_{version}.yml").read_text()
|
||||
return (Path(__file__).parent / f"configs/config_{version}.yml").read_text()
|
||||
else:
|
||||
raise ValueError(f"The specific version is not supported. The supported versions are {str(self.supported_versions)}")
|
||||
|
||||
@@ -99,7 +98,7 @@ class CLAPWrapper():
|
||||
|
||||
# We unwrap the DDP model and save. If the model is not unwrapped before saving, then it needs to be unwrapped before `load_state_dict`:
|
||||
# Reference link: https://discuss.pytorch.org/t/how-to-load-dataparallel-model-which-trained-using-multiple-gpus/146005
|
||||
clap.load_state_dict(model_state_dict)
|
||||
clap.load_state_dict(model_state_dict, strict=False)
|
||||
|
||||
clap.eval() # set clap in eval mode
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.text_model)
|
||||
@@ -184,7 +183,7 @@ class CLAPWrapper():
|
||||
return torch.tensor(batch, dtype=torch.float64)
|
||||
elif isinstance(elem, int):
|
||||
return torch.tensor(batch)
|
||||
elif isinstance(elem, string_classes):
|
||||
elif isinstance(elem, str):
|
||||
return batch
|
||||
elif isinstance(elem, collections.abc.Mapping):
|
||||
return {key: self.default_collate([d[key] for d in batch]) for key in elem}
|
||||
@@ -0,0 +1 @@
|
||||
from .CLAPWrapper import CLAPWrapper as CLAP
|
||||
@@ -2,7 +2,7 @@ import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
|
||||
from models.htsat import HTSATWrapper
|
||||
from .htsat import HTSATWrapper
|
||||
|
||||
def get_audio_encoder(name: str):
|
||||
if name == "Cnn14":
|
||||
@@ -6,11 +6,8 @@
|
||||
# Swin Transformer for Computer Vision: https://arxiv.org/pdf/2103.14030.pdf
|
||||
|
||||
|
||||
import logging
|
||||
import pdb
|
||||
import math
|
||||
import random
|
||||
from numpy.core.fromnumeric import clip, reshape
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.checkpoint as checkpoint
|
||||
@@ -19,15 +16,10 @@ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
|
||||
from torchlibrosa.augmentation import SpecAugmentation
|
||||
|
||||
from itertools import repeat
|
||||
from typing import List
|
||||
try:
|
||||
from models.pytorch_utils import do_mixup, interpolate
|
||||
import models.config as config
|
||||
except:
|
||||
from CLAP_API.models.pytorch_utils import do_mixup, interpolate
|
||||
from CLAP_API.models import config
|
||||
|
||||
import torch.nn.functional as F
|
||||
from .pytorch_utils import do_mixup, interpolate
|
||||
from . import config
|
||||
|
||||
import collections.abc
|
||||
import warnings
|
||||
|
||||
@@ -2,10 +2,9 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as nnf
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from enum import Enum
|
||||
from transformers import GPT2LMHeadModel
|
||||
from typing import Tuple, Optional, Union
|
||||
from typing import Tuple, Optional
|
||||
|
||||
def get_clapcap(name: str):
|
||||
if name == "ClapCaption":
|
||||
@@ -1,5 +1,3 @@
|
||||
import numpy as np
|
||||
import time
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
Generated
+1754
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,30 @@
|
||||
[tool.poetry]
|
||||
name = "msclap"
|
||||
version = "1.3.2"
|
||||
description = "CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks achieving SoTA in several of them including classification, retrieval, and captioning."
|
||||
authors = ["Benjamin Elizalde and Soham Deshmukh and Huaming Wang"]
|
||||
license = "MIT"
|
||||
readme = "README.md"
|
||||
packages = [
|
||||
{ include = "msclap" },
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.8"
|
||||
librosa = "^0.10.1"
|
||||
numpy = "^1.23.0"
|
||||
numba = "^0.58.0"
|
||||
pandas = "^2.0.0"
|
||||
torch = "^2.1.0"
|
||||
torchaudio = "^2.1.0"
|
||||
torchlibrosa = "^0.1.0"
|
||||
torchvision = "^0.16.0"
|
||||
tqdm = "^4.66.1"
|
||||
transformers = "^4.34.0"
|
||||
pyyaml = "^6.0.1"
|
||||
scikit-learn = "^1.3.1"
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
@@ -1,50 +0,0 @@
|
||||
appdirs==1.4.4
|
||||
audioread==3.0.0
|
||||
certifi==2022.12.7
|
||||
cffi==1.15.1
|
||||
charset-normalizer==3.0.1
|
||||
colorama==0.4.6
|
||||
decorator==5.1.1
|
||||
filelock==3.9.0
|
||||
flit_core==3.6.0
|
||||
huggingface-hub==0.12.1
|
||||
idna==3.4
|
||||
importlib-metadata==6.0.0
|
||||
importlib-resources==5.12.0
|
||||
jaraco.classes==3.2.3
|
||||
joblib==1.2.0
|
||||
lazy_loader==0.1
|
||||
librosa==0.10.0
|
||||
llvmlite==0.39.1
|
||||
mkl-service==2.4.0
|
||||
more-itertools==9.0.0
|
||||
msgpack==1.0.4
|
||||
numba==0.56.4
|
||||
numpy==1.23.5
|
||||
packaging==23.0
|
||||
pandas==1.4.2
|
||||
pooch==1.6.0
|
||||
pycparser==2.21
|
||||
pywin32-ctypes==0.2.0
|
||||
PyYAML==6.0
|
||||
regex==2022.10.31
|
||||
requests==2.28.2
|
||||
scikit-learn==1.2.1
|
||||
scipy==1.10.1
|
||||
setuptools==65.6.3
|
||||
six==1.16.0
|
||||
soundfile==0.12.1
|
||||
soxr==0.3.3
|
||||
threadpoolctl==3.1.0
|
||||
tokenizers==0.13.2
|
||||
torch==1.13.1
|
||||
torchaudio==0.13.1
|
||||
torchlibrosa==0.1.0
|
||||
torchvision==0.14.1
|
||||
tqdm==4.64.1
|
||||
transformers==4.26.1
|
||||
typing_extensions==4.4.0
|
||||
urllib3==1.26.14
|
||||
wheel==0.38.4
|
||||
wincertstore==0.2
|
||||
zipp==3.14.0
|
||||
Reference in New Issue
Block a user