[+] Compute tilt

2022-03-20 11:46:02 -04:00
parent 9ea4acc6ba
commit 10df2c3e55
2 changed files with 95 additions and 12 deletions
@@ -1,13 +1,13 @@
 from __future__ import annotations

 import csv
+import json
 import os
 from dataclasses import dataclass
 from multiprocessing import Pool
 from os import PathLike
 from pathlib import Path
 from typing import Iterable, Literal
-
 import jsonpickle as jsonpickle
 import matplotlib.pyplot as plt
 import numpy
@@ -18,9 +18,12 @@ import tqdm
 import seaborn as sns
 from matplotlib.patches import Patch

+from spectral_tilt import tilt
+
 ASAB = Literal['f', 'm']
 COLOR_PINK = '#F5A9B8'
 COLOR_BLUE = '#5BCEFA'
+# Number of worker processes handed to the multiprocessing Pools created below
+CPU_CORES = 36


 def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndarray:
@@ -33,7 +36,6 @@ def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndar
    """
    pitch_values = audio.to_pitch(0.01).selected_array['frequency']
    formant_values = audio.to_formant_burg(0.01)
-
    result = numpy.ndarray([len(pitch_values), 4], 'float32')

    for i in range(len(pitch_values)):
@@ -104,17 +106,23 @@ def get_audio_paths(id_dir: Path, audio_suffix: str = 'wav') -> list[str]:
    return audios


-def compute_vox_celeb_helper(aud_dir: str):
+def compute_vox_celeb_freq(aud_dir: str):
    """
-    Compute one audio file
-
-    :param aud_dir: Audio file path
-    :return: None
+    Compute and save the frequency info of one audio file
+
+    The array returned by calculate_freq_info is passed to numpy.save with the audio
+    path itself as the file name (numpy appends ``.npy`` when the name lacks it).
+
+    :param aud_dir: Audio file path
+    :return: None
    """
    array = calculate_freq_info(parselmouth.Sound(aud_dir))
    numpy.save(aud_dir, array)


+def compute_vox_celeb_tilt(aud_dir: str):
+    """
+    Measure the spectral tilt of one audio file and store it as JSON
+
+    The result is written under the audio file's own path with its suffix replaced
+    by ``.json``, as a single object holding the key ``tilt``.
+
+    :param aud_dir: Audio file path
+    :return: None
+    """
+    measured = tilt(parselmouth.Sound(aud_dir))
+    json_path = Path(aud_dir).with_suffix('.json')
+    with open(json_path, 'w', encoding='utf-8') as out_file:
+        json.dump({'tilt': measured}, out_file)
+
+
 def compute_vox_celeb():
    print('Finding audio files...')
    queue: list[str] = []
@@ -127,8 +135,8 @@ def compute_vox_celeb():
    print('Starting processing...')

    # Compute audio files in a cpu pool
-    with Pool(8) as pool:
-        for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_helper, queue), total=len(queue)):
+    with Pool(CPU_CORES) as pool:
+        for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_tilt, queue), total=len(queue)):
            pass


@@ -206,7 +214,7 @@ def vox_celeb_statistics():
    id_dirs = [id_dir for id, id_dir in loop_id_dirs()]

    # Loop through all ids
-    with Pool(8) as pool:
+    with Pool(CPU_CORES) as pool:
        for _ in tqdm.tqdm(pool.imap(vox_celeb_statistics_helper, id_dirs), total=len(id_dirs)):
            pass

@@ -255,6 +263,7 @@ def collect_statistics():
    fig, ax = subplots(figsize=(10, 5))
    # ax.set_xscale('log')
    #print(sns.load_dataset('tips'))
+
    print("Pitch")
    print(calc_col_stats(f_means[:, 0]))
    print(calc_col_stats(m_means[:, 0]))
@@ -267,6 +276,7 @@ def collect_statistics():
    print("F3")
    print(calc_col_stats(f_means[:, 3]))
    print(calc_col_stats(m_means[:, 3]))
+
    df = pd.DataFrame({headers[i]: f_means[:, i] for i in range(4)})
    dm = pd.DataFrame({headers[i]: m_means[:, i] for i in range(4)})
    # data.boxplot()
@@ -301,5 +311,5 @@ if __name__ == '__main__':
    # print(calculate_freq_info(parselmouth.Sound('../00001.wav')))
    # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Extract-Z-44kHz.flac')))
    # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Azalea.flac')))
-    # vox_celeb_statistics()
-    collect_statistics()
+    compute_vox_celeb()
+    # collect_statistics()
@@ -0,0 +1,73 @@
+import math
+import parselmouth
+
+from parselmouth.praat import call
+
+
+# https://github.com/Voice-Lab/VoiceLab/blob/2edf9678866eb5f5f230bf1578e1aa418f7f4917/Voicelab/toolkits/Voicelab/MeasureSpectralTiltNode.py
+# The closer the result is to positive, the creakier the sound is
+def tilt(sound):
+    """
+    Measure the spectral tilt of a 64 ms window taken from the middle of a sound
+
+    The window is centred at half of the sound's end time (its midpoint when the
+    sound starts at 0). The window's spectrum is converted to dB, stretched so that
+    its dB range matches the number of bins, and a least-squares line is fitted to
+    the stretched values against the zero-based bin index.
+
+    :param sound: Sound to measure; expected to be a parselmouth.Sound
+    :return: Slope of the fitted line, or None (after printing an error) when it
+        cannot be computed: a bin with zero magnitude, fewer than two bins, or a
+        flat spectrum whose dB range is 0
+    """
+    window_length_in_millisecs = 64
+    window_length = window_length_in_millisecs / 1000
+
+    # Compute begin and end times, set window
+    end = call(sound, "Get end time")
+    midpoint = end / 2
+    begintime = midpoint - (window_length / 2)
+    endtime = midpoint + (window_length / 2)
+    part_to_measure = sound.extract_part(begintime, endtime)
+    spectrum = part_to_measure.to_spectrum()
+    total_bins = spectrum.get_number_of_bins()
+
+    # Convert spectral values to dB; the spectrum is queried with 1-based bin numbers
+    db_values = []
+    for bin_number in range(1, total_bins + 1):
+        real_value = spectrum.get_real_value_in_bin(bin_number)
+        imag_value = spectrum.get_imaginary_value_in_bin(bin_number)
+        rmsPower = math.sqrt((real_value ** 2) + (imag_value ** 2))
+        if rmsPower <= 0:
+            # log10 is undefined here, so the whole measurement is abandoned
+            print(f'Error: rmsPower={rmsPower}, needs to be positive!')
+            return None
+        db_values.append(20 * (math.log10(rmsPower / 0.0002)))
+
+    # A line cannot be fitted through fewer than two points
+    if total_bins < 2:
+        print(f'Error: total_bins={total_bins}, needs to be at least 2!')
+        return None
+
+    # Find the dB range, for rescaling purposes
+    min_db = min(db_values)  # this is wrong in Owren's script, where mindB = 0
+    range_db = max(db_values) - min_db
+    if range_db == 0:
+        # Every bin has the same level, so the stretch below would divide by zero
+        print(f'Error: rangedB={range_db}, needs to be positive!')
+        return None
+
+    # Stretch the spectrum to a normalized range that matches the number of frequency
+    # values. Subtracting min_db puts the minimum at 0 whatever its sign.
+    scaling_constant = (total_bins - 1) / range_db
+    scaled_db_values = [(value - min_db) * scaling_constant for value in db_values]
+
+    # Find slope: least-squares fit of the scaled values against the bin index
+    sum_x = sum(range(total_bins))
+    sum_y = sum(scaled_db_values)
+    sum_xx = sum(index ** 2 for index in range(total_bins))
+    sum_xy = sum(index * value for index, value in enumerate(scaled_db_values))
+
+    s_xx = sum_xx - ((sum_x * sum_x) / total_bins)
+    s_xy = sum_xy - ((sum_x * sum_y) / total_bins)
+    return s_xy / s_xx
+# tilt("../creaky i.wav")
+# tilt("../normal i.wav")
+
+# tilt("../Creaky.wav")
+# tilt("../Breathy.wav")