From 10df2c3e55a5e9a57090a10c8bfbe7193e514606 Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Sun, 20 Mar 2022 11:46:02 -0400 Subject: [PATCH] [+] Compute tilt --- src/formant.py | 34 +++++++++++++-------- src/spectral_tilt.py | 73 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 src/spectral_tilt.py diff --git a/src/formant.py b/src/formant.py index 63561b9..8377005 100644 --- a/src/formant.py +++ b/src/formant.py @@ -1,13 +1,13 @@ from __future__ import annotations import csv +import json import os from dataclasses import dataclass from multiprocessing import Pool from os import PathLike from pathlib import Path from typing import Iterable, Literal - import jsonpickle as jsonpickle import matplotlib.pyplot as plt import numpy @@ -18,9 +18,12 @@ import tqdm import seaborn as sns from matplotlib.patches import Patch +from spectral_tilt import tilt + ASAB = Literal['f', 'm'] COLOR_PINK = '#F5A9B8' COLOR_BLUE = '#5BCEFA' +CPU_CORES = 36 def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndarray: @@ -33,7 +36,6 @@ def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndar """ pitch_values = audio.to_pitch(0.01).selected_array['frequency'] formant_values = audio.to_formant_burg(0.01) - result = numpy.ndarray([len(pitch_values), 4], 'float32') for i in range(len(pitch_values)): @@ -104,17 +106,23 @@ def get_audio_paths(id_dir: Path, audio_suffix: str = 'wav') -> list[str]: return audios -def compute_vox_celeb_helper(aud_dir: str): +def compute_vox_celeb_freq(aud_dir: str): """ - Compute one audio file - - :param aud_dir: Audio file path - :return: None + Compute and save the frequency info of one audio file """ array = calculate_freq_info(parselmouth.Sound(aud_dir)) numpy.save(aud_dir, array) +def compute_vox_celeb_tilt(aud_dir: str): + """ + Compute and save the tilt info of one audio file + """ + spectral_tilt = tilt(parselmouth.Sound(aud_dir)) + with 
open(Path(aud_dir).with_suffix('.json'), 'w', encoding='utf-8') as f: + json.dump({'tilt': spectral_tilt}, f) + + def compute_vox_celeb(): print('Finding audio files...') queue: list[str] = [] @@ -127,8 +135,8 @@ def compute_vox_celeb(): print('Starting processing...') # Compute audio files in a cpu pool - with Pool(8) as pool: - for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_helper, queue), total=len(queue)): + with Pool(CPU_CORES) as pool: + for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_tilt, queue), total=len(queue)): pass @@ -206,7 +214,7 @@ def vox_celeb_statistics(): id_dirs = [id_dir for id, id_dir in loop_id_dirs()] # Loop through all ids - with Pool(8) as pool: + with Pool(CPU_CORES) as pool: for _ in tqdm.tqdm(pool.imap(vox_celeb_statistics_helper, id_dirs), total=len(id_dirs)): pass @@ -255,6 +263,7 @@ def collect_statistics(): fig, ax = subplots(figsize=(10, 5)) # ax.set_xscale('log') #print(sns.load_dataset('tips')) + print("Pitch") print(calc_col_stats(f_means[:, 0])) print(calc_col_stats(m_means[:, 0])) @@ -267,6 +276,7 @@ def collect_statistics(): print("F3") print(calc_col_stats(f_means[:, 3])) print(calc_col_stats(m_means[:, 3])) + df = pd.DataFrame({headers[i]: f_means[:, i] for i in range(4)}) dm = pd.DataFrame({headers[i]: m_means[:, i] for i in range(4)}) # data.boxplot() @@ -301,5 +311,5 @@ if __name__ == '__main__': # print(calculate_freq_info(parselmouth.Sound('../00001.wav'))) # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Extract-Z-44kHz.flac'))) # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Azalea.flac'))) - # vox_celeb_statistics() - collect_statistics() + compute_vox_celeb() + # collect_statistics() diff --git a/src/spectral_tilt.py b/src/spectral_tilt.py new file mode 100644 index 0000000..c939cef --- /dev/null +++ b/src/spectral_tilt.py @@ -0,0 +1,73 @@ +import math +import parselmouth + +from parselmouth.praat import call + + +# 
# https://github.com/Voice-Lab/VoiceLab/blob/2edf9678866eb5f5f230bf1578e1aa418f7f4917/Voicelab/toolkits/Voicelab/MeasureSpectralTiltNode.py
# The closer to positive the creakier it is
def tilt(sound, window_length_in_millisecs=64):
    """
    Estimate the spectral tilt of a sound from a short window at its midpoint.

    The window is converted to a spectrum, every bin magnitude is expressed in
    dB, the dB values are rescaled so they span ``0 .. number_of_bins - 1``,
    and the least-squares slope of the rescaled values against the bin index
    is returned.

    :param sound: Sound-like object providing ``xmin``, ``xmax`` and
        ``extract_part(begin, end)``; the extracted part must provide
        ``to_spectrum()`` (e.g. a ``parselmouth.Sound``)
    :param window_length_in_millisecs: Length of the analysed window in
        milliseconds (default 64, the value that used to be hard-coded)
    :return: The slope as a float, or None when it cannot be computed (a bin
        with zero power, an empty spectrum, or a spectrum without any dynamic
        range)
    """
    window_length = window_length_in_millisecs / 1000

    # Centre the window on the real midpoint of the sound. Using `end / 2`
    # is only correct when the sound starts at time 0.
    midpoint = (sound.xmin + sound.xmax) / 2
    half_window = window_length / 2
    part_to_measure = sound.extract_part(midpoint - half_window, midpoint + half_window)
    spectrum = part_to_measure.to_spectrum()

    db_values = _spectrum_to_db(spectrum)
    if db_values is None:
        return None
    if not db_values:
        print('Error: spectrum has no bins, cannot compute tilt!')
        return None

    # Dynamic range, used for rescaling
    min_db = min(db_values)  # this is wrong in Owren's script, where mindB = 0
    range_db = max(db_values) - min_db
    if range_db <= 0:
        # A flat (or single-bin) spectrum cannot be stretched to a normalized
        # range; dividing by the range would raise ZeroDivisionError.
        print(f'Error: rangedB={range_db}, needs to be positive!')
        return None

    # Stretch the spectrum to a normalized range that matches the number of
    # frequency values. Subtracting the minimum (rather than adding its
    # absolute value) also normalizes correctly when the minimum is positive;
    # the slope itself does not depend on this constant shift.
    scaling_constant = (len(db_values) - 1) / range_db
    scaled_db_values = [(value - min_db) * scaling_constant for value in db_values]

    return _regression_slope(scaled_db_values)


def _spectrum_to_db(spectrum):
    """Return the dB level of every spectrum bin, or None if a bin has zero power."""
    reference_level = 0.0002
    db_values = []
    # Spectrum bins are addressed starting at 1
    for bin_number in range(1, spectrum.get_number_of_bins() + 1):
        magnitude = math.hypot(
            spectrum.get_real_value_in_bin(bin_number),
            spectrum.get_imaginary_value_in_bin(bin_number),
        )
        if magnitude <= 0:
            print(f'Error: rmsPower={magnitude}, needs to be positive!')
            return None
        db_values.append(20 * math.log10(magnitude / reference_level))
    return db_values


def _regression_slope(values):
    """Return the least-squares slope of values against their indices 0..n-1 (needs n >= 2)."""
    count = len(values)
    sum_x = sum(range(count))
    sum_y = sum(values)
    sum_xx = sum(x * x for x in range(count))
    sum_xy = sum(x * y for x, y in enumerate(values))

    s_xx = sum_xx - (sum_x * sum_x) / count
    s_xy = sum_xy - (sum_x * sum_y) / count
    return s_xy / s_xx
# tilt("../creaky i.wav")
# tilt("../normal i.wav")

# tilt("../Creaky.wav")
# tilt("../Breathy.wav")