Merge pull request #25 from hykilpikonna/main

Fix Python 3.11 compatibility & improve ease of use

Benjamin Elizalde committed 2023-10-12 13:15:08 -07:00 (committed via GitHub)

24 changed files with 1817 additions and 93 deletions
.gitignore (+3)
@@ -348,3 +348,6 @@ MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
+dist/
+.DS_Store
+._*
README.md (+12 -14)
@@ -4,20 +4,18 @@
CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks achieving SoTA in several of them including classification, retrieval, and captioning.
-<img width="832" alt="clap_diagrams" src="https://github.com/bmartin1/CLAP/assets/26778834/c5340a09-cc0c-4e41-ad5a-61546eaa824c">
+<img width="832" alt="clap_diagrams" src="https://raw.githubusercontent.com/hykilpikonna/CLAP/main/docs/diagram.png">
## Setup
-Install the dependencies: `pip install -r requirements.txt` using Python 3 to get started.
-If you have [conda](https://www.anaconda.com) installed, you can run the following:
+First, install Python 3.8 or higher (3.11 recommended). Then install CLAP using either of the following:
```shell
-git clone https://github.com/microsoft/CLAP.git && \
-cd CLAP && \
-conda create -n clap python=3.10 && \
-conda activate clap && \
-pip install -r requirements.txt
+# Install the PyPI package
+pip install msclap
+# Or install the latest (unstable) git source
+pip install git+https://github.com/microsoft/CLAP.git
```
## NEW CLAP weights
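A quick way to sanity-check the new install path is to import the package and confirm the re-exported entry point exists. A minimal sketch, assuming `pip install msclap` from this PR succeeded:

```python
# Minimal install check; assumes the msclap package from this PR is installed.
import msclap

# __init__.py (added later in this diff) re-exports CLAPWrapper as CLAP.
print(msclap.CLAP)
```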
@@ -31,9 +29,9 @@ In `CLAP\src\`:
- Zero-Shot Classification and Retrieval
```python
-# Load model (Choose between versions '2022' or '2023')
-from CLAPWrapper import CLAPWrapper as CLAP
+from msclap import CLAP
+# Load model (Choose between versions '2022' or '2023')
clap_model = CLAP("<PATH TO WEIGHTS>", version = '2023', use_cuda=False)
# Extract text embeddings
@@ -48,9 +46,9 @@ similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)
- Audio Captioning
```python
-# Load model (Choose version 'clapcap')
-from CLAPWrapper import CLAPWrapper as CLAP
+from msclap import CLAP
+# Load model (Choose version 'clapcap')
clap_model = CLAP("<PATH TO WEIGHTS>", version = 'clapcap', use_cuda=False)
# Generate audio captions
@@ -58,7 +56,7 @@ captions = clap_model.generate_caption(file_paths: List[str])
```
## Examples
-Take a look at `CLAP\src\` for usage examples.
+Take a look at [examples](./examples/) for usage examples.
To run Zero-Shot Classification on the ESC50 dataset try the following:
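The ESC50 command itself is truncated in this diff. As a rough sketch of what zero-shot scoring looks like with the new import path (the audio path and class labels below are placeholders, and `get_audio_embeddings` is assumed from the surrounding README context):

```python
from msclap import CLAP

# Placeholder weights path and labels; the real script lives in examples/.
clap_model = CLAP("<PATH TO WEIGHTS>", version='2023', use_cuda=False)

class_prompts = ["this is a sound of " + c for c in ["dog", "rain", "sea waves"]]
text_embeddings = clap_model.get_text_embeddings(class_prompts)
audio_embeddings = clap_model.get_audio_embeddings(["audio.wav"])

# Higher similarity means the audio is closer to that class prompt.
similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)
```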
docs/diagram.png (BIN, new image, 81 KiB; binary file not shown)
@@ -1,11 +1,11 @@
"""
This is an example using CLAPCAP for audio captioning.
"""
-from CLAPWrapper import CLAPWrapper
+from msclap import CLAP
# Load and initialize CLAP
weights_path = "weights_path"
-clap_model = CLAPWrapper(weights_path, version = 'clapcap', use_cuda=False)
+clap_model = CLAP(weights_path, version = 'clapcap', use_cuda=False)
#Load audio files
audio_files = ['audio_file']
@@ -3,7 +3,7 @@ This is an example using CLAP to perform zeroshot
classification on ESC50 (https://github.com/karolpiczak/ESC-50).
"""
-from CLAPWrapper import CLAPWrapper
+from msclap import CLAP
from esc50_dataset import ESC50
import torch.nn.functional as F
import numpy as np
@@ -18,7 +18,7 @@ y = [prompt + x for x in dataset.classes]
# Load and initialize CLAP
weights_path = "weights_path"
-clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
+clap_model = CLAP(weights_path, version = '2023', use_cuda=False)
# Computing text embeddings
text_embeddings = clap_model.get_text_embeddings(y)
@@ -1,7 +1,7 @@
"""
This is an example using CLAP for zero-shot inference.
"""
-from CLAPWrapper import CLAPWrapper
+from msclap import CLAP
import torch.nn.functional as F
# Define classes for zero-shot
@@ -17,7 +17,7 @@ audio_files = ['audio_file']
# Load and initialize CLAP
weights_path = "weights_path"
# Setting use_cuda = True will load the model on a GPU using CUDA
-clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
+clap_model = CLAP(weights_path, version = '2023', use_cuda=False)
# compute text embeddings from natural text
text_embeddings = clap_model.get_text_embeddings(class_prompts)
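All of the example changes are the same mechanical rename: the old in-repo `CLAPWrapper` import becomes the packaged `msclap` one. Downstream code that must run against both layouts could use a fallback; an illustrative shim, not part of the PR:

```python
# Illustrative compatibility shim, not from this PR: prefer the packaged
# import, fall back to the old in-repo module for a pre-package checkout.
try:
    from msclap import CLAP
except ImportError:
    from CLAPWrapper import CLAPWrapper as CLAP

weights_path = "weights_path"
clap_model = CLAP(weights_path, version='2023', use_cuda=False)
```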
msclap/CLAPWrapper.py (+6 -7)
@@ -1,19 +1,18 @@
+from pathlib import Path
import warnings
warnings.filterwarnings("ignore")
import random
import torchaudio
-from torch._six import string_classes
import collections
import re
import numpy as np
from transformers import AutoTokenizer, logging
-from models.clap import CLAP
-from models.mapper import get_clapcap
+from .models.clap import CLAP
+from .models.mapper import get_clapcap
import math
import torchaudio.transforms as T
import os
import torch
-from importlib_resources import files
import argparse
import yaml
import sys
@@ -42,7 +41,7 @@ class CLAPWrapper():
def get_config_path(self, version):
if version in self.supported_versions:
-return files('configs').joinpath(f"config_{version}.yml").read_text()
+return (Path(__file__).parent / f"configs/config_{version}.yml").read_text()
else:
raise ValueError(f"The specific version is not supported. The supported versions are {str(self.supported_versions)}")
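The old lookup via `importlib_resources.files('configs')` treated `configs` as a top-level package, which only resolves in a source checkout with the right `sys.path`; resolving against `__file__` works both there and from an installed wheel. The pattern in isolation (a sketch; the function name is hypothetical):

```python
from pathlib import Path

def read_packaged_config(version: str) -> str:
    # Resolve configs/config_<version>.yml relative to this module's own
    # directory, independent of the caller's working directory or sys.path.
    return (Path(__file__).parent / f"configs/config_{version}.yml").read_text()
```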
@@ -99,7 +98,7 @@ class CLAPWrapper():
# We unwrap the DDP model and save. If the model is not unwrapped and saved, then the model needs to unwrapped before `load_state_dict`:
# Reference link: https://discuss.pytorch.org/t/how-to-load-dataparallel-model-which-trained-using-multiple-gpus/146005
-clap.load_state_dict(model_state_dict)
+clap.load_state_dict(model_state_dict, strict=False)
clap.eval() # set clap in eval mode
tokenizer = AutoTokenizer.from_pretrained(args.text_model)
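`strict=False` lets loading succeed when checkpoint and model keys do not match exactly (for example, training-only heads absent from the inference model). PyTorch returns the mismatches, so they can be surfaced rather than silently discarded; a small sketch using the names from the diff above:

```python
# Sketch: log what strict=False tolerated instead of ignoring it.
result = clap.load_state_dict(model_state_dict, strict=False)
if result.missing_keys or result.unexpected_keys:
    print("missing keys:", result.missing_keys)
    print("unexpected keys:", result.unexpected_keys)
```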
@@ -184,7 +183,7 @@ class CLAPWrapper():
return torch.tensor(batch, dtype=torch.float64)
elif isinstance(elem, int):
return torch.tensor(batch)
-elif isinstance(elem, string_classes):
+elif isinstance(elem, str):
return batch
elif isinstance(elem, collections.abc.Mapping):
return {key: self.default_collate([d[key] for d in batch]) for key in elem}
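`torch._six` was a private Python 2/3 compatibility shim that PyTorch has since removed (it is gone in the 2.x line this PR targets), and on Python 3 `string_classes` was simply `str`, so the plain `isinstance` check is a drop-in replacement. Code that must still span both PyTorch generations could do, illustratively:

```python
# Illustrative only: tolerate both pre- and post-removal PyTorch.
try:
    from torch._six import string_classes  # older PyTorch
except ImportError:
    string_classes = str  # the shim resolved to str on Python 3

assert isinstance("a label", string_classes)
```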
msclap/__init__.py (+1)
@@ -0,0 +1 @@
+from .CLAPWrapper import CLAPWrapper as CLAP
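This one-line `__init__.py` defines the package's public entry point: it aliases `CLAPWrapper` to the shorter `CLAP` name that the updated README and examples import. After installation, both spellings refer to the same class:

```python
from msclap import CLAP
from msclap.CLAPWrapper import CLAPWrapper

# The top-level name is just an alias for the wrapper class.
assert CLAP is CLAPWrapper
```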
@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
-from models.htsat import HTSATWrapper
+from .htsat import HTSATWrapper
def get_audio_encoder(name: str):
if name == "Cnn14":
msclap/models/htsat.py (+3 -11)
@@ -6,11 +6,8 @@
# Swin Transformer for Computer Vision: https://arxiv.org/pdf/2103.14030.pdf
import logging
-import pdb
import math
-import random
-from numpy.core.fromnumeric import clip, reshape
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
@@ -19,15 +16,10 @@ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
from itertools import repeat
from typing import List
-try:
-from models.pytorch_utils import do_mixup, interpolate
-import models.config as config
-except:
-from CLAP_API.models.pytorch_utils import do_mixup, interpolate
-from CLAP_API.models import config
+import torch.nn.functional as F
+from .pytorch_utils import do_mixup, interpolate
+from . import config
import collections.abc
import warnings
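The removed try/except guessed at the package's location with two absolute import paths behind a bare `except`, which breaks once the code ships as an installed package. Explicit relative imports resolve against the module's own package (`msclap.models` in this layout) regardless of the caller's working directory or `sys.path`. The before/after, as it reads inside the module:

```python
# Before: cwd-dependent absolute imports behind a bare except.
#   try:
#       from models.pytorch_utils import do_mixup, interpolate
#       import models.config as config
#   except ImportError:
#       from CLAP_API.models.pytorch_utils import do_mixup, interpolate
#       from CLAP_API.models import config

# After: relative imports, valid from any entry point once installed.
from .pytorch_utils import do_mixup, interpolate
from . import config
```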
@@ -2,10 +2,9 @@
import torch
import torch.nn as nn
from torch.nn import functional as nnf
-from torch.utils.data import Dataset, DataLoader
from enum import Enum
from transformers import GPT2LMHeadModel
-from typing import Tuple, Optional, Union
+from typing import Tuple, Optional
def get_clapcap(name: str):
if name == "ClapCaption":
@@ -1,5 +1,3 @@
-import numpy as np
-import time
import torch
import torch.nn as nn
poetry.lock (generated, +1754)
File diff suppressed because it is too large.
pyproject.toml (+30)
@@ -0,0 +1,30 @@
[tool.poetry]
name = "msclap"
version = "1.3.2"
description = "CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks achieving SoTA in several of them including classification, retrieval, and captioning."
authors = ["Benjamin Elizalde and Soham Deshmukh and Huaming Wang"]
license = "MIT"
readme = "README.md"
packages = [
{ include = "msclap" },
]

[tool.poetry.dependencies]
python = "^3.8"
librosa = "^0.10.1"
numpy = "^1.23.0"
numba = "^0.58.0"
pandas = "^2.0.0"
torch = "^2.1.0"
torchaudio = "^2.1.0"
torchlibrosa = "^0.1.0"
torchvision = "^0.16.0"
tqdm = "^4.66.1"
transformers = "^4.34.0"
pyyaml = "^6.0.1"
scikit-learn = "^1.3.1"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
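Poetry's caret constraints allow compatible upgrades instead of exact pins: `^2.1.0` means `>=2.1.0,<3.0.0`, and `^0.1.0` means `>=0.1.0,<0.2.0`. To see which versions were actually resolved at install time, the standard library suffices; a small sketch:

```python
# Sketch: print the installed versions of a few key distributions.
from importlib.metadata import version

for dist in ("msclap", "torch", "transformers"):
    print(dist, version(dist))
```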
requirements.txt (-50, deleted)
@@ -1,50 +0,0 @@
appdirs==1.4.4
audioread==3.0.0
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.0.1
colorama==0.4.6
decorator==5.1.1
filelock==3.9.0
flit_core==3.6.0
huggingface-hub==0.12.1
idna==3.4
importlib-metadata==6.0.0
importlib-resources==5.12.0
jaraco.classes==3.2.3
joblib==1.2.0
lazy_loader==0.1
librosa==0.10.0
llvmlite==0.39.1
mkl-service==2.4.0
more-itertools==9.0.0
msgpack==1.0.4
numba==0.56.4
numpy==1.23.5
packaging==23.0
pandas==1.4.2
pooch==1.6.0
pycparser==2.21
pywin32-ctypes==0.2.0
PyYAML==6.0
regex==2022.10.31
requests==2.28.2
scikit-learn==1.2.1
scipy==1.10.1
setuptools==65.6.3
six==1.16.0
soundfile==0.12.1
soxr==0.3.3
threadpoolctl==3.1.0
tokenizers==0.13.2
torch==1.13.1
torchaudio==0.13.1
torchlibrosa==0.1.0
torchvision==0.14.1
tqdm==4.64.1
transformers==4.26.1
typing_extensions==4.4.0
urllib3==1.26.14
wheel==0.38.4
wincertstore==0.2
zipp==3.14.0