Get DeepFormants working again

* Minor syntax tweaks to make the code Python 3 compatible * Fixes for various NumPy warnings/errors, either due to use of "float" where "int" is required, or domain errors on log functions * Replaced the use of the obsolete Python-2-only scikits.talkbox library with a compatible LPC implementation from the Conch project * Documentation update to indicate that an old version of "rnn" is required * Invoke Lua scripts via "luajit" directly, instead of going through the "th" frontend (to reduce the dependency footprint)
2020-01-17 15:57:00 -05:00
5 changed files with 317 additions and 30 deletions
@@ -24,9 +24,7 @@ Download the code. The code is based on signal processing package in Python call
 Dependencies:
 Run these lines in a terminal to install everything necessary for feature extraction.
 ```
-sudo apt-get install python-numpy python-scipy python-nose python-pip
-
-sudo pip install scikits.talkbox 
+sudo apt-get install python3-numpy python3-scipy python3-nose
 ```
 Next for the installation of Torch for loading the models run this.
 ```
@@ -37,30 +35,31 @@ cd ~/torch; bash install-deps;
 ./install.sh
 ```
 ```
-luarocks install rnn
+git clone https://github.com/Element-Research/rnn.git old-rnn
+cd old-rnn; luarocks make rocks/rnn-scm-1.rockspec
 ```
-The Estimation model can be downloaded here and because of size constraints the Tracking model can be abtained by download from this link
-[tracking_model.mat] (https://drive.google.com/open?id=0Bxkc5_D0JjpiZWx4eTU1d0hsVXc)
+The Estimation model can be downloaded here and because of size constraints the Tracking model can be obtained by download from this link:
+[tracking_model.mat](https://drive.google.com/open?id=0Bxkc5_D0JjpiZWx4eTU1d0hsVXc)

 ## How to use:

 For vowel formant estimation, call the main script in a terminal with the following inputs: wav file, formant output filename, and the vowel begin and end times:

 ```
-python formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3
+python3 formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3
 ```

 or the vowel begin and end times can be taken from a TextGrid file (here the name of the TextGrid is Example.TextGrid and the vowel is taken from a tier called "VOWEL"):

 ```
-python formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \
+python3 formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \
          --textgrid_tier VOWEL
 ```

 For formant tracking, just call the script with the wav file and output filename:

 ```
-python formants.py data/Example.wav data/ExamplePredictions.csv
+python3 formants.py data/Example.wav data/ExamplePredictions.csv
 ```


@@ -9,9 +9,9 @@ from os.path import isfile, join
 import math
 from scipy.fftpack.realtransforms import dct
 from scipy.signal import lfilter, hamming
-from copy import deepcopy
 from scipy.fftpack import fft, ifft
-from scikits.talkbox.linpred import lpc
+#from scikits.talkbox.linpred import lpc  # obsolete
+from helpers.conch_lpc import lpc
 import shutil
 from helpers.utilities import *

@@ -88,9 +88,9 @@ def periodogram(x, nfft=None, fs=1):

    pxx = np.abs(fft(x, nfft)) ** 2
    if nfft % 2 == 0:
-        pn = nfft / 2 + 1
+        pn = nfft // 2 + 1
    else:
-        pn = (nfft + 1 )/ 2
+        pn = (nfft + 1) // 2

    fgrid = np.linspace(0, fs * 0.5, pn)
    return pxx[:pn] / (n * fs), fgrid
@@ -137,9 +137,9 @@ def arspec(x, order, nfft=None, fs=1):

    # This is not enough to deal correctly with even/odd size
    if nfft % 2 == 0:
-        pn = nfft / 2 + 1
+        pn = nfft // 2 + 1
    else:
-        pn = (nfft + 1 )/ 2
+        pn = (nfft + 1) // 2

    px = 1 / np.fft.fft(a, nfft)[:pn]
    pxx = np.real(np.conj(px) * px)
@@ -200,7 +200,6 @@ def preemp(input, p):


 def arspecs(input_wav,order,Atal=False):
-    epsilon = 0.0000000001
    data = input_wav
    if Atal:
        ar = atal(data, order, 30)
@@ -211,8 +210,10 @@ def arspecs(input_wav,order,Atal=False):
        for k, l in zip(ars[0], ars[1]):
            ar.append(math.log(math.sqrt((k**2)+(l**2))))
        for val in range(0,len(ar)):
-            if ar[val] == 0.0:
-                ar[val] = deepcopy(epsilon)
+            if ar[val] < 0.0:
+                ar[val] = np.nan
+            elif ar[val] == 0.0:
+                ar[val] = epsilon
        mspec1 = np.log10(ar)
        # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
        ar = dct(mspec1, type=2, norm='ortho', axis=-1)
@@ -221,10 +222,10 @@ def arspecs(input_wav,order,Atal=False):

 def specPS(input_wav,pitch):
        N = len(input_wav)
-        samps = N/pitch
+        samps = N // pitch
        if samps == 0:
            samps = 1
-        frames = N/samps
+        frames = N // samps
        data = input_wav[0:frames]
        specs = periodogram(data,nfft=4096)
        for i in range(1,int(samps)):
@@ -236,10 +237,11 @@ def specPS(input_wav,pitch):
            specs[0][s] /= float(samps)
        peri = []
        for k, l in zip(specs[0], specs[1]):
-            if k == 0 and l == 0:
-                peri.append(epsilon)
-            else:
-                peri.append(math.log(math.sqrt((k ** 2) + (l ** 2))))
+            m = math.sqrt((k ** 2) + (l ** 2))
+            if m > 0: m = math.log(m)
+            if m == 0: m = epsilon
+            elif m < 0: m = np.nan
+            peri.append(m)
        # Filter the spectrum through the triangle filterbank
        mspec = np.log10(peri)
        # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
@@ -9,19 +9,19 @@ import shutil

 def predict_from_times(wav_filename, preds_filename, begin, end):
    tmp_features_filename = tempfile._get_default_tempdir() + "/" + next(tempfile._get_candidate_names()) + ".txt"
-    print tmp_features_filename
+    print(tmp_features_filename)

    if begin > 0.0 or end > 0.0:
        features.create_features(wav_filename, tmp_features_filename, begin, end)
-        easy_call("th load_estimation_model.lua " + tmp_features_filename + ' ' + preds_filename)
+        easy_call("luajit load_estimation_model.lua " + tmp_features_filename + ' ' + preds_filename)
    else:
        features.create_features(wav_filename, tmp_features_filename)
-        easy_call("th load_tracking_model.lua " + tmp_features_filename + ' ' + preds_filename)
+        easy_call("luajit load_tracking_model.lua " + tmp_features_filename + ' ' + preds_filename)


 def predict_from_textgrid(wav_filename, preds_filename, textgrid_filename, textgrid_tier):

-    print wav_filename
+    print(wav_filename)

    if os.path.exists(preds_filename):
        os.remove(preds_filename)
@@ -4,12 +4,12 @@ if [ $# -eq 2 ]
 then
    tempfile=`mktemp -t txt`
    python extract_features.py $1 $tempfile
-    th load_estimation_model.lua $tempfile $2
+    luajit load_estimation_model.lua $tempfile $2
 elif [ $# -eq 4 ]
 then
    tempfile=`mktemp -t txt`
    python extract_features.py $1 $tempfile --begin $3 --end $4
-    th load_estimation_model.lua $tempfile $2
+    luajit load_estimation_model.lua $tempfile $2
 else
    echo "$0 wav_filename pred_csv_filename [begin_time end_time]"
 fi
@@ -0,0 +1,286 @@
+# This file has been copied (with minor changes) from Michael
+# McAuliffe's Conch project, to provide a compatible replacement
+# implementation of the lpc() function from the obsolete Python-2-only
+# scikits.talkbox library.
+#
+# Conch repository: https://github.com/mmcauliffe/Conch-sounds
+# Source: https://github.com/mmcauliffe/Conch-sounds/blob/master/conch/analysis/formants/lpc.py
+
+# Copyright (c) 2015 Michael McAuliffe
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+#import librosa
+import numpy as np
+import scipy as sp
+from scipy.signal import lfilter
+
+from scipy.fftpack import fft, ifft
+from scipy.signal import gaussian
+
+#from ..helper import nextpow2
+#from ..functions import BaseAnalysisFunction
+
+# Source: https://github.com/mmcauliffe/Conch-sounds/blob/master/conch/analysis/helper.py
+def nextpow2(x):
+    """Return the first integer N such that 2**N >= abs(x)"""
+    return np.ceil(np.log2(np.abs(x)))
+
+def lpc_ref(signal, order):
+    """Compute the Linear Prediction Coefficients.
+
+    Return the order + 1 LPC coefficients for the signal. c = lpc(x, k) will
+    find the k+1 coefficients of a k order linear filter:
+
+      xp[n] = -c[1] * x[n-2] - ... - c[k-1] * x[n-k-1]
+
+    Such as the sum of the squared-error e[i] = xp[i] - x[i] is minimized.
+
+    Parameters
+    ----------
+    signal: array_like
+        input signal
+    order : int
+        LPC order (the output will have order + 1 items)
+
+    Notes
+    ----
+    This is just for reference, as it is using the direct inversion of the
+    toeplitz matrix, which is really slow"""
+    if signal.ndim > 1:
+        raise ValueError("Array of rank > 1 not supported yet")
+    if order > signal.size:
+        raise ValueError("Input signal must have a lenght >= lpc order")
+
+    if order > 0:
+        p = order + 1
+        r = np.zeros(p, 'float32')
+        # Number of non zero values in autocorrelation one needs for p LPC
+        # coefficients
+        nx = np.min([p, signal.size])
+        x = np.correlate(signal, signal, 'full')
+        r[:nx] = x[signal.size - 1:signal.size + order]
+        phi = np.dot(sp.linalg.inv(sp.linalg.toeplitz(r[:-1])), -r[1:])
+        return np.concatenate(([1.], phi))
+    else:
+        return np.ones(1, dtype='float32')
+
+
+# @jit
+def levinson_1d(r, order):
+    """Levinson-Durbin recursion, to efficiently solve symmetric linear systems
+    with toeplitz structure.
+
+    Parameters
+    ---------
+    r : array-like
+        input array to invert (since the matrix is symmetric Toeplitz, the
+        corresponding pxp matrix is defined by p items only). Generally the
+        autocorrelation of the signal for linear prediction coefficients
+        estimation. The first item must be a non zero real.
+
+    Notes
+    ----
+    This implementation is in python, hence unsuitable for any serious
+    computation. Use it as educational and reference purpose only.
+
+    Levinson is a well-known algorithm to solve the Hermitian toeplitz
+    equation:
+
+                       _          _
+        -R[1] = R[0]   R[1]   ... R[p-1]    a[1]
+         :      :      :          :      *  :
+         :      :      :          _      *  :
+        -R[p] = R[p-1] R[p-2] ... R[0]      a[p]
+                       _
+    with respect to a (  is the complex conjugate). Using the special symmetry
+    in the matrix, the inversion can be done in O(p^2) instead of O(p^3).
+    """
+    r = np.atleast_1d(r)
+    if r.ndim > 1:
+        raise ValueError("Only rank 1 are supported for now.")
+
+    n = r.size
+    if n < 1:
+        raise ValueError("Cannot operate on empty array !")
+    elif order > n - 1:
+        raise ValueError("Order should be <= size-1")
+
+    if not np.isreal(r[0]):
+        raise ValueError("First item of input must be real.")
+    elif not np.isfinite(1 / r[0]):
+        raise ValueError("First item should be != 0")
+
+    # Estimated coefficients
+    a = np.empty(order + 1, 'float32')
+    # temporary array
+    t = np.empty(order + 1, 'float32')
+    # Reflection coefficients
+    k = np.empty(order, 'float32')
+
+    a[0] = 1.
+    e = r[0]
+
+    for i in range(1, order + 1):
+        acc = r[i]
+        for j in range(1, i):
+            acc += a[j] * r[i - j]
+        k[i - 1] = -acc / e
+        a[i] = k[i - 1]
+
+        for j in range(order):
+            t[j] = a[j]
+
+        for j in range(1, i):
+            a[j] += k[i - 1] * np.conj(t[i - j])
+
+        e *= 1 - k[i - 1] * np.conj(k[i - 1])
+
+    return a, e, k
+
+
+# @jit
+def _acorr_last_axis(x, nfft, maxlag):
+    a = np.real(ifft(np.abs(fft(x, n=nfft) ** 2)))
+    return a[..., :maxlag + 1] / x.shape[-1]
+
+
+# @jit
+def acorr_lpc(x, axis=-1):
+    """Compute autocorrelation of x along the given axis.
+
+    This compute the biased autocorrelation estimator (divided by the size of
+    input signal)
+
+    Notes
+    -----
+        The reason why we do not use acorr directly is for speed issue."""
+    if not np.isrealobj(x):
+        raise ValueError("Complex input not supported yet")
+
+    maxlag = x.shape[axis]
+    nfft = int(2 ** nextpow2(2 * maxlag - 1))
+
+    if axis != -1:
+        x = np.swapaxes(x, -1, axis)
+    a = _acorr_last_axis(x, nfft, maxlag)
+    if axis != -1:
+        a = np.swapaxes(a, -1, axis)
+    return a
+
+
+# @jit
+def lpc(signal, order, axis=-1):
+    """Compute the Linear Prediction Coefficients.
+
+    Return the order + 1 LPC coefficients for the signal. c = lpc(x, k) will
+    find the k+1 coefficients of a k order linear filter:
+
+      xp[n] = -c[1] * x[n-2] - ... - c[k-1] * x[n-k-1]
+
+    Such as the sum of the squared-error e[i] = xp[i] - x[i] is minimized.
+
+    Parameters
+    ----------
+    signal: array_like
+        input signal
+    order : int
+        LPC order (the output will have order + 1 items)
+
+    Returns
+    -------
+    a : array-like
+        the solution of the inversion.
+    e : array-like
+        the prediction error.
+    k : array-like
+        reflection coefficients.
+
+    Notes
+    -----
+    This uses Levinson-Durbin recursion for the autocorrelation matrix
+    inversion, and fft for the autocorrelation computation.
+
+    For small order, particularly if order << signal size, direct computation
+    of the autocorrelation is faster: use levinson and correlate in this case."""
+    n = signal.shape[axis]
+    if order > n:
+        raise ValueError("Input signal must have length >= order")
+
+    r = acorr_lpc(signal, axis)
+    return levinson_1d(r, order)
+
+
+def process_frame(X, window, num_formants, new_sr):
+    X = X * window
+    A, e, k = lpc(X, num_formants * 2)
+
+    rts = np.roots(A)
+    rts = rts[np.where(np.imag(rts) >= 0)]
+    angz = np.arctan2(np.imag(rts), np.real(rts))
+    frqs = angz * (new_sr / (2 * np.pi))
+    frq_inds = np.argsort(frqs)
+    frqs = frqs[frq_inds]
+    bw = -1 / 2 * (new_sr / (2 * np.pi)) * np.log(np.abs(rts[frq_inds]))
+    return frqs, bw
+
+
+def lpc_formants(signal, sr, num_formants, max_freq, time_step,
+                 win_len, window_shape='gaussian'):
+    output = {}
+    new_sr = 2 * max_freq
+    alpha = np.exp(-2 * np.pi * 50 * (1 / new_sr))
+    proc = lfilter([1., -alpha], 1, signal)
+    if sr > new_sr:
+        proc = librosa.resample(proc, sr, new_sr)
+    nperseg = int(win_len * new_sr)
+    nperstep = int(time_step * new_sr)
+    if window_shape == 'gaussian':
+        window = gaussian(nperseg + 2, 0.45 * (nperseg - 1) / 2)[1:nperseg + 1]
+    else:
+        window = np.hanning(nperseg + 2)[1:nperseg + 1]
+    indices = np.arange(int(nperseg / 2), proc.shape[0] - int(nperseg / 2) + 1, nperstep)
+    num_frames = len(indices)
+    for i in range(num_frames):
+        if nperseg % 2 != 0:
+            X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2) + 1]
+        else:
+            X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2)]
+        frqs, bw = process_frame(X, window, num_formants, new_sr)
+        formants = []
+        for j, f in enumerate(frqs):
+            if f < 50:
+                continue
+            if f > max_freq - 50:
+                continue
+            formants.append((np.asscalar(f), np.asscalar(bw[j])))
+        missing = num_formants - len(formants)
+        if missing:
+            formants += [(None, None)] * missing
+        output[indices[i] / new_sr] = formants
+    return output
+
+
+#class FormantTrackFunction(BaseAnalysisFunction):
+#    def __init__(self, num_formants=5, max_frequency=5000,
+#                 time_step=0.01, window_length=0.025, window_shape='gaussian'):
+#        super(FormantTrackFunction, self).__init__()
+#        self.arguments = [num_formants, max_frequency, time_step, window_length, window_shape]
+#        self._function = lpc_formants
+#        self.requires_file = False