[+] Compute tilt

This commit is contained in:
Hykilpikonna
2022-03-20 11:46:02 -04:00
parent 9ea4acc6ba
commit 10df2c3e55
2 changed files with 95 additions and 12 deletions
+22 -12
View File
@@ -1,13 +1,13 @@
from __future__ import annotations from __future__ import annotations
import csv import csv
import json
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from multiprocessing import Pool from multiprocessing import Pool
from os import PathLike from os import PathLike
from pathlib import Path from pathlib import Path
from typing import Iterable, Literal from typing import Iterable, Literal
import jsonpickle as jsonpickle import jsonpickle as jsonpickle
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy import numpy
@@ -18,9 +18,12 @@ import tqdm
import seaborn as sns import seaborn as sns
from matplotlib.patches import Patch from matplotlib.patches import Patch
from spectral_tilt import tilt
ASAB = Literal['f', 'm'] ASAB = Literal['f', 'm']
COLOR_PINK = '#F5A9B8' COLOR_PINK = '#F5A9B8'
COLOR_BLUE = '#5BCEFA' COLOR_BLUE = '#5BCEFA'
CPU_CORES = 36
def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndarray: def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndarray:
@@ -33,7 +36,6 @@ def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndar
""" """
pitch_values = audio.to_pitch(0.01).selected_array['frequency'] pitch_values = audio.to_pitch(0.01).selected_array['frequency']
formant_values = audio.to_formant_burg(0.01) formant_values = audio.to_formant_burg(0.01)
result = numpy.ndarray([len(pitch_values), 4], 'float32') result = numpy.ndarray([len(pitch_values), 4], 'float32')
for i in range(len(pitch_values)): for i in range(len(pitch_values)):
@@ -104,17 +106,23 @@ def get_audio_paths(id_dir: Path, audio_suffix: str = 'wav') -> list[str]:
return audios return audios
def compute_vox_celeb_freq(aud_dir: str):
    """
    Compute and save the frequency info of one audio file

    The array returned by calculate_freq_info is written with numpy.save, which appends '.npy'
    to the given path when it does not already end with that extension.
    """
    sound = parselmouth.Sound(aud_dir)
    freq_info = calculate_freq_info(sound)
    numpy.save(aud_dir, freq_info)
def compute_vox_celeb_tilt(aud_dir: str):
    """
    Compute the spectral tilt of one audio file and store it next to the audio as JSON

    The output file has the same path as the audio file with its suffix replaced by '.json',
    and contains a single object of the form {"tilt": <value>}.
    """
    sound = parselmouth.Sound(aud_dir)
    payload = {'tilt': tilt(sound)}
    json_path = Path(aud_dir).with_suffix('.json')
    json_path.write_text(json.dumps(payload), encoding='utf-8')
def compute_vox_celeb(): def compute_vox_celeb():
print('Finding audio files...') print('Finding audio files...')
queue: list[str] = [] queue: list[str] = []
@@ -127,8 +135,8 @@ def compute_vox_celeb():
print('Starting processing...') print('Starting processing...')
# Compute audio files in a cpu pool # Compute audio files in a cpu pool
with Pool(8) as pool: with Pool(CPU_CORES) as pool:
for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_helper, queue), total=len(queue)): for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_tilt, queue), total=len(queue)):
pass pass
@@ -206,7 +214,7 @@ def vox_celeb_statistics():
id_dirs = [id_dir for id, id_dir in loop_id_dirs()] id_dirs = [id_dir for id, id_dir in loop_id_dirs()]
# Loop through all ids # Loop through all ids
with Pool(8) as pool: with Pool(CPU_CORES) as pool:
for _ in tqdm.tqdm(pool.imap(vox_celeb_statistics_helper, id_dirs), total=len(id_dirs)): for _ in tqdm.tqdm(pool.imap(vox_celeb_statistics_helper, id_dirs), total=len(id_dirs)):
pass pass
@@ -255,6 +263,7 @@ def collect_statistics():
fig, ax = subplots(figsize=(10, 5)) fig, ax = subplots(figsize=(10, 5))
# ax.set_xscale('log') # ax.set_xscale('log')
#print(sns.load_dataset('tips')) #print(sns.load_dataset('tips'))
print("Pitch") print("Pitch")
print(calc_col_stats(f_means[:, 0])) print(calc_col_stats(f_means[:, 0]))
print(calc_col_stats(m_means[:, 0])) print(calc_col_stats(m_means[:, 0]))
@@ -267,6 +276,7 @@ def collect_statistics():
print("F3") print("F3")
print(calc_col_stats(f_means[:, 3])) print(calc_col_stats(f_means[:, 3]))
print(calc_col_stats(m_means[:, 3])) print(calc_col_stats(m_means[:, 3]))
df = pd.DataFrame({headers[i]: f_means[:, i] for i in range(4)}) df = pd.DataFrame({headers[i]: f_means[:, i] for i in range(4)})
dm = pd.DataFrame({headers[i]: m_means[:, i] for i in range(4)}) dm = pd.DataFrame({headers[i]: m_means[:, i] for i in range(4)})
# data.boxplot() # data.boxplot()
@@ -301,5 +311,5 @@ if __name__ == '__main__':
# print(calculate_freq_info(parselmouth.Sound('../00001.wav'))) # print(calculate_freq_info(parselmouth.Sound('../00001.wav')))
# print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Extract-Z-44kHz.flac'))) # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Extract-Z-44kHz.flac')))
# print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Azalea.flac'))) # print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Azalea.flac')))
# vox_celeb_statistics() compute_vox_celeb()
collect_statistics() # collect_statistics()
+73
View File
@@ -0,0 +1,73 @@
import math
import parselmouth
from parselmouth.praat import call
# https://github.com/Voice-Lab/VoiceLab/blob/2edf9678866eb5f5f230bf1578e1aa418f7f4917/Voicelab/toolkits/Voicelab/MeasureSpectralTiltNode.py
# The closer the tilt is to positive, the creakier the voice is.
def tilt(sound, window_length_in_millisecs=64):
    """
    Estimate the spectral tilt of a sound from a short window centred on its temporal midpoint.

    The window is converted to a spectrum, the magnitude of every bin is expressed in dB, the dB
    values are rescaled to span 0 .. (number of bins - 1), and the least-squares slope of the
    rescaled values against the 0-based bin index is returned.

    :param sound: parselmouth.Sound to analyse
    :param window_length_in_millisecs: Length of the analysed window in milliseconds
    :return: Slope of the regression line, or None (after printing an error) when a bin has
        non-positive magnitude, the spectrum has fewer than two bins, or the spectrum is flat
    """
    window_length = window_length_in_millisecs / 1000

    # Centre the window on the real midpoint: the time domain of a sound does not have to start
    # at 0 (e.g. a part extracted with preserved times), so `end / 2` alone would be off-centre.
    start = call(sound, "Get start time")
    end = call(sound, "Get end time")
    midpoint = (start + end) / 2
    half_window = window_length / 2

    part_to_measure = sound.extract_part(midpoint - half_window, midpoint + half_window)
    spectrum = part_to_measure.to_spectrum()
    total_bins = spectrum.get_number_of_bins()

    # A slope needs at least two points; this also guards max()/min() on an empty list below
    if total_bins < 2:
        print(f'Error: total_bins={total_bins}, need at least 2 bins to fit a slope!')
        return None

    # Convert spectral values to dB. Bin numbers passed to the spectrum are 1-based.
    db_values = []
    for bin_number in range(1, total_bins + 1):
        real_value = spectrum.get_real_value_in_bin(bin_number)
        imag_value = spectrum.get_imaginary_value_in_bin(bin_number)
        rms_power = math.hypot(real_value, imag_value)
        if rms_power <= 0:
            print(f'Error: rmsPower={rms_power}, needs to be positive!')
            return None
        # The 0.0002 reference only adds the same constant to every dB value; that offset is
        # removed again by the min-subtraction below, so it does not influence the slope.
        db_values.append(20 * math.log10(rms_power / 0.0002))

    # Find the dB range, for rescaling purposes
    min_db = min(db_values)  # this is wrong in Owren's script, where mindB = 0
    range_db = max(db_values) - min_db
    if range_db == 0:
        # A perfectly flat spectrum cannot be stretched (division by zero)
        print('Error: dB range is 0, cannot normalize a flat spectrum!')
        return None

    # Stretch the spectrum to a normalized range that matches the number of frequency values.
    # Subtracting the minimum (rather than adding its absolute value) puts the lowest value at 0
    # regardless of the sign of min_db.
    scaling_constant = (total_bins - 1) / range_db
    scaled_db_values = [(value - min_db) * scaling_constant for value in db_values]

    # Least-squares slope of the scaled dB values against the 0-based bin index
    bins = range(total_bins)
    sum_x = sum(bins)
    sum_y = sum(scaled_db_values)
    sum_xx = sum(x * x for x in bins)
    sum_xy = sum(x * y for x, y in zip(bins, scaled_db_values))

    s_xx = sum_xx - (sum_x * sum_x) / total_bins
    s_xy = sum_xy - (sum_x * sum_y) / total_bins
    return s_xy / s_xx
# tilt("../creaky i.wav")
# tilt("../normal i.wav")
# tilt("../Creaky.wav")
# tilt("../Breathy.wav")