[+] Compute tilt

2022-03-20 11:46:02 -04:00
parent 9ea4acc6ba
commit 10df2c3e55
2 changed files with 95 additions and 12 deletions
@@ -1,13 +1,13 @@
 from __future__ import annotations
 import csv
 import json
 import os
 from dataclasses import dataclass
 from multiprocessing import Pool
 from os import PathLike
 from pathlib import Path
 from typing import Iterable, Literal
 import jsonpickle as jsonpickle
 import matplotlib.pyplot as plt
 import numpy
@@ -18,9 +18,12 @@ import tqdm
 import seaborn as sns
 from matplotlib.patches import Patch
 from spectral_tilt import tilt
 ASAB = Literal['f', 'm']
 COLOR_PINK = '#F5A9B8'
 COLOR_BLUE = '#5BCEFA'
 CPU_CORES = 36
 def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndarray:
@@ -33,7 +36,6 @@ def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndar
    """
    pitch_values = audio.to_pitch(0.01).selected_array['frequency']
    formant_values = audio.to_formant_burg(0.01)
    result = numpy.ndarray([len(pitch_values), 4], 'float32')
    for i in range(len(pitch_values)):
@@ -104,17 +106,23 @@ def get_audio_paths(id_dir: Path, audio_suffix: str = 'wav') -> list[str]:
    return audios
-def compute_vox_celeb_helper(aud_dir: str):
+def compute_vox_celeb_freq(aud_dir: str):
    """
-    Compute one audio file
+    Compute and save the frequency info of one audio file
    :param aud_dir: Audio file path
    :return: None
    """
    array = calculate_freq_info(parselmouth.Sound(aud_dir))
    numpy.save(aud_dir, array)
 def compute_vox_celeb_tilt(aud_dir: str):
    """
    Measure the spectral tilt of one audio file and store it as JSON

    The result is written to the audio file's path with its suffix replaced
    by '.json', as an object holding the single key 'tilt'.

    :param aud_dir: Audio file path
    :return: None
    """
    measured = tilt(parselmouth.Sound(aud_dir))
    serialized = json.dumps({'tilt': measured})
    Path(aud_dir).with_suffix('.json').write_text(serialized, encoding='utf-8')
 def compute_vox_celeb():
    print('Finding audio files...')
    queue: list[str] = []
@@ -127,8 +135,8 @@ def compute_vox_celeb():
    print('Starting processing...')
    # Compute audio files in a cpu pool
-    with Pool(8) as pool:
+    with Pool(CPU_CORES) as pool:
-        for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_helper, queue), total=len(queue)):
+        for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_tilt, queue), total=len(queue)):
            pass
@@ -206,7 +214,7 @@ def vox_celeb_statistics():
    id_dirs = [id_dir for id, id_dir in loop_id_dirs()]
    # Loop through all ids
-    with Pool(8) as pool:
+    with Pool(CPU_CORES) as pool:
        for _ in tqdm.tqdm(pool.imap(vox_celeb_statistics_helper, id_dirs), total=len(id_dirs)):
            pass
@@ -255,6 +263,7 @@ def collect_statistics():
    fig, ax = subplots(figsize=(10, 5))
    # ax.set_xscale('log')
    #print(sns.load_dataset('tips'))
    print("Pitch")
    print(calc_col_stats(f_means[:, 0]))
    print(calc_col_stats(m_means[:, 0]))
@@ -267,6 +276,7 @@ def collect_statistics():
    print("F3")
    print(calc_col_stats(f_means[:, 3]))
    print(calc_col_stats(m_means[:, 3]))
    df = pd.DataFrame({headers[i]: f_means[:, i] for i in range(4)})
    dm = pd.DataFrame({headers[i]: m_means[:, i] for i in range(4)})
    # data.boxplot()
@@ -301,5 +311,5 @@ if __name__ == '__main__':
    # print(calculate_freq_info(parselmouth.Sound('../00001.wav')))
    # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Extract-Z-44kHz.flac')))
    # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Azalea.flac')))
-    # vox_celeb_statistics()
+    compute_vox_celeb()
-    collect_statistics()
+    # collect_statistics()
@@ -0,0 +1,73 @@
 import math
 import parselmouth
 from parselmouth.praat import call
 # https://github.com/Voice-Lab/VoiceLab/blob/2edf9678866eb5f5f230bf1578e1aa418f7f4917/Voicelab/toolkits/Voicelab/MeasureSpectralTiltNode.py
 # The closer the tilt is to positive, the creakier the sound is
 def tilt(sound, window_length_in_millisecs=64):
    """
    Measure the spectral tilt of a sound around its temporal midpoint

    A window centred on the middle of the sound is turned into a spectrum,
    the magnitude of every bin is expressed in dB, the dB values are shifted
    and stretched so that they span 0 .. (number of bins - 1), and the
    least-squares slope of those values over the bin index is returned.

    :param sound: Sound to analyse; it must provide get_start_time(),
        get_end_time() and extract_part() (callers pass a parselmouth.Sound)
    :param window_length_in_millisecs: Length of the analysed window, in
        milliseconds
    :return: Slope of the fitted line, 0.0 when every bin has the same level,
        or None when no slope can be computed (a bin with zero magnitude, or
        fewer than two bins)
    """
    window_length = window_length_in_millisecs / 1000
    # Centre the window on the real midpoint; halving the end time alone is
    # only correct for sounds whose start time is 0
    midpoint = (sound.get_start_time() + sound.get_end_time()) / 2
    begintime = midpoint - (window_length / 2)
    endtime = midpoint + (window_length / 2)
    part_to_measure = sound.extract_part(begintime, endtime)
    spectrum = part_to_measure.to_spectrum()
    total_bins = spectrum.get_number_of_bins()
    if total_bins < 2:
        # A line cannot be fitted through fewer than two points
        print(f'Error: total_bins={total_bins}, need at least 2 bins!')
        return None
    # convert spectral values to dB (bins are queried with 1-based numbers)
    db_values = []
    for bin_number in range(1, total_bins + 1):
        real_value = spectrum.get_real_value_in_bin(bin_number)
        imag_value = spectrum.get_imaginary_value_in_bin(bin_number)
        # hypot does not underflow to 0 for tiny but non-zero components
        rms_power = math.hypot(real_value, imag_value)
        if rms_power <= 0:
            print(f'Error: rmsPower={rms_power}, needs to be positive!')
            return None
        db_values.append(20 * math.log10(rms_power / 0.0002))
    # find the dB range, for rescaling purposes
    min_db = min(db_values)  # this is wrong in Owren's script, where mindB = 0
    range_db = max(db_values) - min_db
    if range_db == 0:
        # Flat spectrum: rescaling is undefined and the fitted line is level
        return 0.0
    # stretch the spectrum to a normalized range that matches the number of frequency values
    scaling_constant = (total_bins - 1) / range_db
    # Subtract the minimum so the lowest value lands on 0 whatever its sign
    scaled_db_values = [(value - min_db) * scaling_constant for value in db_values]
    # find slope (ordinary least squares of scaled dB over the 0-based bin index)
    bin_indices = range(total_bins)
    sum_x = sum(bin_indices)
    sum_y = sum(scaled_db_values)
    sum_xx = sum(x * x for x in bin_indices)
    sum_xy = sum(x * y for x, y in zip(bin_indices, scaled_db_values))
    s_xx = sum_xx - ((sum_x * sum_x) / total_bins)
    s_xy = sum_xy - ((sum_x * sum_y) / total_bins)
    return s_xy / s_xx
 # tilt("../creaky i.wav")
 # tilt("../normal i.wav")
 # tilt("../Creaky.wav")
 # tilt("../Breathy.wav")