[+] Compute tilt
This commit is contained in:
+22
-12
@@ -1,13 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing import Pool
|
||||
from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Literal
|
||||
|
||||
import jsonpickle as jsonpickle
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy
|
||||
@@ -18,9 +18,12 @@ import tqdm
|
||||
import seaborn as sns
|
||||
from matplotlib.patches import Patch
|
||||
|
||||
from spectral_tilt import tilt
|
||||
|
||||
ASAB = Literal['f', 'm']
|
||||
COLOR_PINK = '#F5A9B8'
|
||||
COLOR_BLUE = '#5BCEFA'
|
||||
CPU_CORES = 36
|
||||
|
||||
|
||||
def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndarray:
|
||||
@@ -33,7 +36,6 @@ def calculate_freq_info(audio: parselmouth.Sound, show_plot=False) -> numpy.ndar
|
||||
"""
|
||||
pitch_values = audio.to_pitch(0.01).selected_array['frequency']
|
||||
formant_values = audio.to_formant_burg(0.01)
|
||||
|
||||
result = numpy.ndarray([len(pitch_values), 4], 'float32')
|
||||
|
||||
for i in range(len(pitch_values)):
|
||||
@@ -104,17 +106,23 @@ def get_audio_paths(id_dir: Path, audio_suffix: str = 'wav') -> list[str]:
|
||||
return audios
|
||||
|
||||
|
||||
def compute_vox_celeb_helper(aud_dir: str):
|
||||
def compute_vox_celeb_freq(aud_dir: str):
|
||||
"""
|
||||
Compute one audio file
|
||||
|
||||
:param aud_dir: Audio file path
|
||||
:return: None
|
||||
Compute and save the frequency info of one audio file
|
||||
"""
|
||||
array = calculate_freq_info(parselmouth.Sound(aud_dir))
|
||||
numpy.save(aud_dir, array)
|
||||
|
||||
|
||||
def compute_vox_celeb_tilt(aud_dir: str):
    """
    Measure the spectral tilt of one audio file and store it as JSON.

    The result is written to the audio file's path with its suffix replaced
    by ``.json``, as an object with the single key ``tilt``.

    :param aud_dir: Audio file path
    :return: None
    """
    sound = parselmouth.Sound(aud_dir)
    payload = {'tilt': tilt(sound)}

    # Same location as the audio file, only the suffix differs
    json_path = Path(aud_dir).with_suffix('.json')
    with json_path.open('w', encoding='utf-8') as handle:
        handle.write(json.dumps(payload))
|
||||
|
||||
|
||||
def compute_vox_celeb():
|
||||
print('Finding audio files...')
|
||||
queue: list[str] = []
|
||||
@@ -127,8 +135,8 @@ def compute_vox_celeb():
|
||||
print('Starting processing...')
|
||||
|
||||
# Compute audio files in a cpu pool
|
||||
with Pool(8) as pool:
|
||||
for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_helper, queue), total=len(queue)):
|
||||
with Pool(CPU_CORES) as pool:
|
||||
for _ in tqdm.tqdm(pool.imap(compute_vox_celeb_tilt, queue), total=len(queue)):
|
||||
pass
|
||||
|
||||
|
||||
@@ -206,7 +214,7 @@ def vox_celeb_statistics():
|
||||
id_dirs = [id_dir for id, id_dir in loop_id_dirs()]
|
||||
|
||||
# Loop through all ids
|
||||
with Pool(8) as pool:
|
||||
with Pool(CPU_CORES) as pool:
|
||||
for _ in tqdm.tqdm(pool.imap(vox_celeb_statistics_helper, id_dirs), total=len(id_dirs)):
|
||||
pass
|
||||
|
||||
@@ -255,6 +263,7 @@ def collect_statistics():
|
||||
fig, ax = subplots(figsize=(10, 5))
|
||||
# ax.set_xscale('log')
|
||||
#print(sns.load_dataset('tips'))
|
||||
|
||||
print("Pitch")
|
||||
print(calc_col_stats(f_means[:, 0]))
|
||||
print(calc_col_stats(m_means[:, 0]))
|
||||
@@ -267,6 +276,7 @@ def collect_statistics():
|
||||
print("F3")
|
||||
print(calc_col_stats(f_means[:, 3]))
|
||||
print(calc_col_stats(m_means[:, 3]))
|
||||
|
||||
df = pd.DataFrame({headers[i]: f_means[:, i] for i in range(4)})
|
||||
dm = pd.DataFrame({headers[i]: m_means[:, i] for i in range(4)})
|
||||
# data.boxplot()
|
||||
@@ -301,5 +311,5 @@ if __name__ == '__main__':
|
||||
# print(calculate_freq_info(parselmouth.Sound('../00001.wav')))
|
||||
# print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Extract-Z-44kHz.flac')))
|
||||
# print(calculate_freq_info(parselmouth.Sound('D:/Downloads/Vowels-Azalea.flac')))
|
||||
# vox_celeb_statistics()
|
||||
collect_statistics()
|
||||
compute_vox_celeb()
|
||||
# collect_statistics()
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
import math
|
||||
import parselmouth
|
||||
|
||||
from parselmouth.praat import call
|
||||
|
||||
|
||||
# https://github.com/Voice-Lab/VoiceLab/blob/2edf9678866eb5f5f230bf1578e1aa418f7f4917/Voicelab/toolkits/Voicelab/MeasureSpectralTiltNode.py
|
||||
# The closer the tilt is to positive, the creakier the voice is.
|
||||
def tilt(sound):
    """
    Compute the spectral tilt of a sound.

    A 64 ms window centred on half of the sound's end time is extracted and
    converted to a spectrum. Each bin's magnitude is converted to dB, the dB
    values are rescaled so that they span ``0 .. total_bins - 1``, and the
    least-squares slope of the rescaled values against the bin index is
    returned.

    :param sound: Sound object to measure (must provide ``extract_part`` and be
        usable with ``call(sound, "Get end time")``)
    :return: Slope of the fitted line, or None (after printing an error) when
        it cannot be computed: a bin with non-positive magnitude, fewer than
        two bins, or a completely flat spectrum
    """
    window_length_in_millisecs = 64
    window_length = window_length_in_millisecs / 1000

    # Compute begin and end times of the window around the midpoint
    end = call(sound, "Get end time")
    midpoint = end / 2
    begintime = midpoint - (window_length / 2)
    endtime = midpoint + (window_length / 2)

    part_to_measure = sound.extract_part(begintime, endtime)
    spectrum = part_to_measure.to_spectrum()
    total_bins = spectrum.get_number_of_bins()

    # Convert spectral values to dB; bin numbers passed to the spectrum start at 1
    db_values = []
    for bin_number in range(1, total_bins + 1):
        real_value = spectrum.get_real_value_in_bin(bin_number)
        imag_value = spectrum.get_imaginary_value_in_bin(bin_number)
        rms_power = math.sqrt((real_value ** 2) + (imag_value ** 2))
        if rms_power <= 0:
            # log10 is undefined for non-positive magnitudes
            print(f'Error: rmsPower={rms_power}, needs to be positive!')
            return None
        db_values.append(20 * (math.log10(rms_power / 0.0002)))

    # A line cannot be fitted through fewer than two points (sXX would be 0)
    bin_count = len(db_values)
    if bin_count < 2:
        print(f'Error: total_bins={total_bins}, need at least 2 bins to compute a slope!')
        return None

    # Find the dB range, for rescaling purposes
    max_db = max(db_values)
    min_db = min(db_values)  # this is wrong in Owren's script, where mindB = 0
    range_db = max_db - min_db
    if range_db <= 0:
        # A flat spectrum cannot be stretched: the scaling constant would divide by 0
        print(f'Error: rangedB={range_db}, spectrum is flat and cannot be rescaled!')
        return None

    # Stretch the spectrum to a normalized range that matches the number of
    # frequency values. Subtracting min_db (rather than adding abs(min_db))
    # puts the minimum at 0 regardless of its sign; the slope is unaffected
    # by this constant shift.
    scaling_constant = (total_bins - 1) / range_db
    scaled_db_values = [(value - min_db) * scaling_constant for value in db_values]

    # Least-squares slope of the scaled dB values against the 0-based bin index
    sum_x = sum(range(bin_count))
    sum_y = sum(scaled_db_values)
    sum_xx = sum(index ** 2 for index in range(bin_count))
    sum_xy = sum(index * value for index, value in enumerate(scaled_db_values))

    s_xx = sum_xx - ((sum_x * sum_x) / bin_count)
    s_xy = sum_xy - ((sum_x * sum_y) / bin_count)
    return s_xy / s_xx
|
||||
# tilt("../creaky i.wav")
|
||||
# tilt("../normal i.wav")
|
||||
|
||||
# tilt("../Creaky.wav")
|
||||
# tilt("../Breathy.wav")
|
||||
Reference in New Issue
Block a user