[+] Create validation script for CN-Celeb

2021-12-23 15:07:42 -05:00
parent 4c8c43b8ef
commit 963e50dfbc
2 changed files with 78 additions and 3 deletions
@@ -138,12 +138,12 @@ def draw_result(file: str, result: Result):
    return to_wav(file, wav_callback)


-def get_result_percentages(result: Result) -> tuple[float, float, float]:
+def get_result_percentages(result: Result) -> tuple[float, float, float, float]:
    """
    Get percentages

    :param result: Result
-    :return: %female, %male, %other
+    :return: %female, %male, %other, %female-vs-female+male
    """
    # Count total and categorical durations
    total_dur = 0
@@ -160,7 +160,11 @@ def get_result_percentages(result: Result) -> tuple[float, float, float]:
    # Return results
    f = durations.get('female', 0)
    m = durations.get('male', 0)
-    return f, m, 1 - f - m
+
+    fm_total = f + m
+    pf = 0 if fm_total == 0 else f / fm_total
+
+    return f, m, 1 - f - m, pf


 def test():
@@ -0,0 +1,71 @@
+import json
+import os
+import warnings
+from pathlib import Path
+
+import numpy as np
+from inaSpeechSegmenter import Segmenter
+
+from ina_main import process, get_result_percentages
+
+
+def segment_all():
+    """
+    Run the segmenter over every speaker directory and store the results
+
+    For each directory under the module-level ``data_dir`` whose name starts
+    with ``id``, every ``.flac`` recording that is not a singing recording and
+    has no sibling ``.json`` yet is analysed, and its result is written next
+    to it as ``{'f': frames, 'r': ratios}``.  The speaker's ``total.json`` is
+    then rewritten with the per-type and overall averages of the stored
+    ratios, computed over every recording of that speaker that has a result
+    file (including results left by an earlier run), so a resumed run does
+    not replace the averages with partial ones.
+
+    ``data_dir`` must be defined at module level (as a ``Path``) before this
+    function is called.
+
+    :return: None
+    """
+    # Create segmenter
+    seg = Segmenter()
+    np.seterr(invalid='ignore')
+
+    # Loop through all celebrities
+    speaker_ids = [d for d in os.listdir(data_dir) if d.startswith('id')]
+    for speaker_id in speaker_ids:
+        id_dir = data_dir.joinpath(speaker_id)
+
+        # Loop through all recordings (Exclude singing for now)
+        utters = [id_dir.joinpath(r) for r in os.listdir(id_dir)
+                  if r.endswith('.flac') and not r.startswith('singing')]
+
+        # Exclude existing
+        pending = [u for u in utters if not u.with_suffix('.json').exists()]
+        if not pending:
+            continue
+
+        # Analyze
+        results = process(seg, [str(u) for u in pending], verbose=True)
+
+        # Write results
+        for result in results.results:
+            # f: Frames, r: Ratios
+            ratios = [round(r, 3) for r in get_result_percentages(result)]
+            stored = {'f': result.frames, 'r': ratios}
+            Path(result.file).with_suffix('.json').write_text(json.dumps(stored))
+
+        # Sum the stored ratios of every recording that has a result file,
+        # not only the ones analysed in this run.
+        # (The last element of each list is the number of recordings summed.)
+        total = [0, 0, 0, 0, 0]
+        type_totals = {}
+        for utter in utters:
+            stored_file = utter.with_suffix('.json')
+            if not stored_file.exists():
+                continue
+            ratios = json.loads(stored_file.read_text())['r']
+
+            # Utterance type is the part of the file name before the first
+            # '-'; partition() keeps the whole stem when there is no '-'
+            # instead of raising like str.index() does.
+            utter_type = stored_file.stem.partition('-')[0]
+            sums = type_totals.setdefault(utter_type, [0, 0, 0, 0, 0])
+            for i, ratio in enumerate(ratios[:4]):
+                sums[i] += ratio
+                total[i] += ratio
+            sums[-1] += 1
+            total[-1] += 1
+
+        # No result file exists for this speaker: nothing to average
+        if total[-1] == 0:
+            continue
+
+        # Write type averages
+        type_averages = {t: [r / sums[-1] for r in sums[:-1]]
+                         for t, sums in type_totals.items()}
+        total_average = [r / total[-1] for r in total[:-1]]
+        obj = {'type_averages': type_averages, 'total_averages': total_average}
+        id_dir.joinpath('total.json').write_text(json.dumps(obj))
+
+
+if __name__ == '__main__':
+    # Root of the local CN-Celeb_flac copy; segment_all() reads the
+    # module-level data_dir defined here.
+    cn_celeb_root = Path('C:/Users/me/Workspace/Data/CN-Celeb_flac')
+    data_dir = cn_celeb_root / 'data'
+
+    segment_all()