Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| bdb36bc4a4 | |||
| b0df9d73b1 |
-10
@@ -1,10 +0,0 @@
|
||||
__pycache__/
|
||||
|
||||
data/*.*
|
||||
!data/examples/
|
||||
output/*.*
|
||||
tracking_model.dat
|
||||
tracking_model.dat.gz
|
||||
temp/*.*
|
||||
|
||||
.idea
|
||||
@@ -1,7 +1,9 @@
|
||||
# DeepFormants - PyTorch
|
||||
DeepFormants
|
||||
============
|
||||
|
||||
Shua Dissen (shua.dissen@gmail.com)
|
||||
Joseph Keshet (joseph.keshet@biu.ac.il)
|
||||
Joseph Keshet (joseph.keshet@biu.ac.il)
|
||||
|
||||
|
||||
DeepFormants is a software package for formant tracking and estimation, using two algorithms based on deep networks. It works as follows:
|
||||
* The user provides a wav file with an initial stop consonant.
|
||||
@@ -12,6 +14,8 @@ DeepFormants is a software package for formant tracking and estimation, using tw
|
||||
|
||||
This is a beta version of DeepFormants. Any reports of bugs, comments on how to improve the software or documentation, or questions are greatly appreciated, and should be sent to the authors at the addresses given above.
|
||||
|
||||
---
|
||||
|
||||
|
||||
## Installation instructions
|
||||
|
||||
@@ -20,7 +24,9 @@ Download the code. The code is based on signal processing package in Python call
|
||||
Dependencies:
|
||||
Run these lines in a terminal to install everything necessary for feature extraction.
|
||||
```
|
||||
sudo apt-get install python3-numpy python3-scipy python3-nose
|
||||
sudo apt-get install python-numpy python-scipy python-nose python-pip
|
||||
|
||||
sudo pip install scikits.talkbox
|
||||
```
|
||||
Next, to install Torch (used for loading the models), run the following:
|
||||
```
|
||||
@@ -31,31 +37,30 @@ cd ~/torch; bash install-deps;
|
||||
./install.sh
|
||||
```
|
||||
```
|
||||
git clone https://github.com/Element-Research/rnn.git old-rnn
|
||||
cd old-rnn; luarocks make rocks/rnn-scm-1.rockspec
|
||||
luarocks install rnn
|
||||
```
|
||||
The Estimation model can be downloaded here and because of size constraints the Tracking model can be obtained by download from this link:
|
||||
[tracking_model.mat](https://drive.google.com/open?id=0Bxkc5_D0JjpiZWx4eTU1d0hsVXc)
|
||||
The Estimation model can be downloaded here and, because of size constraints, the Tracking model can be obtained by download from this link:
|
||||
[tracking_model.dat.gz](https://drive.google.com/open?id=1-BwlbbHykIV52c-SL1ofcppxZ5pTTXai)
|
||||
|
||||
## How to use:
|
||||
|
||||
For vowel formant estimation, call the main script in a terminal with the following inputs: wav file, formant output filename, and the vowel begin and end times:
|
||||
|
||||
```
|
||||
python3 formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3
|
||||
python formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3
|
||||
```
|
||||
|
||||
or the vowel begin and end times can be taken from a TextGrid file (here the name of the TextGrid is Example.TextGrid and the vowel is taken from a tier called "VOWEL"):
|
||||
|
||||
```
|
||||
python3 formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \
|
||||
python formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \
|
||||
--textgrid_tier VOWEL
|
||||
```
|
||||
|
||||
For formant tracking, just call the script with the wav file and output filename:
|
||||
|
||||
```
|
||||
python3 formants.py data/Example.wav data/ExamplePredictions.csv
|
||||
python formants.py data/Example.wav data/ExamplePredictions.csv
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
__author__ = 'jkeshet'
|
||||
Binary file not shown.
@@ -0,0 +1,2 @@
|
||||
NAME,F1,F2,F3,F4
|
||||
Example.wav,0.44537545612545,1.6422395494712,2.4786549639365,3.7613720611662
|
||||
|
@@ -1,2 +0,0 @@
|
||||
NAME,F1,F2,F3,F4
|
||||
data/Example.wav,445.3781247138977,1642.2462463378906,2478.6531925201416,3761.37638092041
|
||||
|
@@ -1,232 +0,0 @@
|
||||
NAME,F1,F2,F3,F4
|
||||
data/Example.wav0,597.52205346157,1613.615825947,2511.8207285284,3755.1438757858
|
||||
data/Example.wav1,695.08480385345,1702.1706191558,2596.4426788053,3798.3721538423
|
||||
data/Example.wav2,726.79075772661,1730.6156241641,2529.5339933464,3807.384140635
|
||||
data/Example.wav3,671.77748170461,1823.1959179265,2642.5386767146,3921.9318337055
|
||||
data/Example.wav4,577.26908479146,1658.9374345583,2507.1805413575,3614.0162748738
|
||||
data/Example.wav5,550.50722496918,1354.6169786772,2112.3333772251,2942.3889707506
|
||||
data/Example.wav6,670.04808527628,1721.7457206609,2504.707938971,3639.9432310079
|
||||
data/Example.wav7,684.92226190631,1890.1166920604,2607.1436989061,3881.6625485381
|
||||
data/Example.wav8,686.580095755,2006.1529795143,2669.1368249138,3930.0540770768
|
||||
data/Example.wav9,704.616588865,2082.8735492951,2725.6508141261,3979.9081525148
|
||||
data/Example.wav10,718.34596795837,2129.5719518295,2778.9886243771,3911.4009284072
|
||||
data/Example.wav11,738.12104125024,2196.3921107039,2830.6141101685,3945.1600780317
|
||||
data/Example.wav12,747.04236867145,2237.8571572583,2865.1145994638,3983.2438101659
|
||||
data/Example.wav13,757.52185658947,2262.8835036017,2894.0268377733,4032.0698122443
|
||||
data/Example.wav14,794.56202137723,2292.9250146918,2904.0583816242,4064.956479487
|
||||
data/Example.wav15,790.79377531938,2305.3923552752,2914.8010797784,4102.8657742226
|
||||
data/Example.wav16,754.2014691902,2296.2740822086,2918.6506819728,4093.5949673686
|
||||
data/Example.wav17,753.61338439514,2305.5901354885,2941.4827120608,4114.9736720268
|
||||
data/Example.wav18,751.33215593808,2303.9278843335,2963.7034688043,4080.8958987697
|
||||
data/Example.wav19,751.74861908763,2294.1665486638,2969.1708133022,4052.0389260712
|
||||
data/Example.wav20,752.20384863157,2263.3609611672,2960.7171653614,4032.3822222124
|
||||
data/Example.wav21,770.17882943915,2258.3641355602,2947.7116753323,4020.992568346
|
||||
data/Example.wav22,781.72306969874,2275.7714060748,2968.5430522874,4073.3884117019
|
||||
data/Example.wav23,767.10101225833,2285.7353288563,2991.478364211,4105.5821065336
|
||||
data/Example.wav24,713.05002492772,2258.4466922682,2990.0470727972,4115.3480684532
|
||||
data/Example.wav25,611.7969034471,2167.9040942168,2922.8273892721,3995.5696686444
|
||||
data/Example.wav26,464.79210160717,2216.9364941434,2902.6787165574,3787.1029546751
|
||||
data/Example.wav27,392.65101221768,2241.0843289313,2882.2946023061,3729.1100713992
|
||||
data/Example.wav28,356.80962181559,2247.3252960538,2868.0795793285,3702.8591416446
|
||||
data/Example.wav29,336.9676772927,2249.0793891123,2856.1652102349,3688.9097495654
|
||||
data/Example.wav30,326.31158629661,2247.0007064468,2847.416021249,3678.3969558777
|
||||
data/Example.wav31,317.02540438115,2246.1378049692,2837.0713463389,3668.4663509487
|
||||
data/Example.wav32,316.79609159736,2238.2372739626,2824.6751485046,3658.6574288876
|
||||
data/Example.wav33,311.81580759928,2233.9672603427,2811.5325892632,3650.0889295259
|
||||
data/Example.wav34,309.75648639653,2222.1951339059,2790.4140506337,3637.495284395
|
||||
data/Example.wav35,303.11808105036,2197.8377902068,2759.3459658566,3621.4651632683
|
||||
data/Example.wav36,300.07752292403,2139.6717590425,2716.2862832378,3597.1403776907
|
||||
data/Example.wav37,297.80619082987,2029.1436415698,2666.5071300739,3593.0232729185
|
||||
data/Example.wav38,302.76576287517,1883.6037448571,2632.7944209903,3566.5764758258
|
||||
data/Example.wav39,310.58715857994,1695.7093762879,2601.2359905618,3578.9921031022
|
||||
data/Example.wav40,329.36107592013,1506.8001665802,2566.4777731297,3574.0241337336
|
||||
data/Example.wav41,333.62712502459,1334.9304246222,2537.2661992439,3615.8905250384
|
||||
data/Example.wav42,343.92517617242,1257.8469712718,2548.6360602514,3608.753460545
|
||||
data/Example.wav43,355.72406454493,1266.9462591767,2555.762409923,3563.6341640243
|
||||
data/Example.wav44,353.35995469911,1295.7360334708,2541.8108288522,3537.0785332541
|
||||
data/Example.wav45,360.10719906727,1333.1447032372,2550.7381205563,3524.8234949992
|
||||
data/Example.wav46,358.56568716995,1421.9066811358,2569.8299617768,3551.4837671167
|
||||
data/Example.wav47,357.76589312374,1508.5752248245,2581.5208384623,3571.1153114675
|
||||
data/Example.wav48,361.02168734969,1578.8816475878,2595.1474893025,3581.7115897268
|
||||
data/Example.wav49,359.07561862133,1644.8613659186,2608.8160358409,3585.0460760813
|
||||
data/Example.wav50,352.62221542712,1695.4771441192,2607.0448130886,3594.1609116241
|
||||
data/Example.wav51,363.93994626501,1711.5051886296,2624.5562096012,3569.9529344004
|
||||
data/Example.wav52,282.14832173631,1635.5238860263,2662.5388733105,3673.039938999
|
||||
data/Example.wav53,238.01956608046,1685.143014879,2681.7911930688,3747.4362196863
|
||||
data/Example.wav54,270.38958334904,1674.099569196,2620.586370391,3813.3469193242
|
||||
data/Example.wav55,287.42519287375,1656.0382437507,2548.1687198886,3885.8811197819
|
||||
data/Example.wav56,297.50982220042,1646.5536123116,2490.0838905831,4001.8684884366
|
||||
data/Example.wav57,269.0833478587,1655.0781624051,2533.8535493795,4098.626102004
|
||||
data/Example.wav58,291.68328316815,1563.9679575743,2514.3394114724,4068.4713671901
|
||||
data/Example.wav59,373.26591074927,1470.0576108226,2536.1465738964,3887.3007630968
|
||||
data/Example.wav60,464.84084023832,1476.0970324231,2596.3413545437,3802.4460913542
|
||||
data/Example.wav61,532.78835221886,1503.9007128017,2676.9189270645,3773.1451371845
|
||||
data/Example.wav62,576.16842542913,1521.9442378943,2715.5872800012,3795.4597172608
|
||||
data/Example.wav63,612.9538586962,1510.2095985565,2711.3466980751,3787.8266496721
|
||||
data/Example.wav64,629.38634961618,1492.0035227392,2758.1039316015,3813.3692044249
|
||||
data/Example.wav65,632.21786673436,1503.6049091858,2728.8614765733,3829.6745931041
|
||||
data/Example.wav66,628.76514164972,1399.37905218,2727.178823681,3732.1010399703
|
||||
data/Example.wav67,636.82812175669,1382.5526600019,2700.6454369069,3741.2410738449
|
||||
data/Example.wav68,644.06202835304,1402.8833288655,2678.6612844051,3763.5773931668
|
||||
data/Example.wav69,665.04075110941,1455.4049891716,2678.1758807676,3749.7199742507
|
||||
data/Example.wav70,663.02738666806,1495.5861490809,2605.1125468193,3906.3056119887
|
||||
data/Example.wav71,666.87736727536,1492.3669814759,2548.126781982,3984.947976164
|
||||
data/Example.wav72,565.20706464525,1576.2364598879,2632.6207344777,4071.1717865148
|
||||
data/Example.wav73,515.69661558703,1637.7513729536,2663.9521213724,4089.587512229
|
||||
data/Example.wav74,503.97065378186,1665.6196537584,2674.2600076661,4094.0318435706
|
||||
data/Example.wav75,425.4988884807,1766.4652022781,2696.6257683491,4260.4826318254
|
||||
data/Example.wav76,373.34254527995,1823.0121878332,2680.0339335153,4323.5695332979
|
||||
data/Example.wav77,352.98177798978,1867.7964949578,2662.6618621315,4338.3873722444
|
||||
data/Example.wav78,394.96498686686,1871.3700048344,2652.4816853078,4438.0938852379
|
||||
data/Example.wav79,357.15002055549,1775.3943852451,2597.8223938733,4298.8671017868
|
||||
data/Example.wav80,329.50023533097,1705.4079753771,2546.4962395447,4159.7082892855
|
||||
data/Example.wav81,307.70045447983,1823.3596586369,2600.5280126313,4126.8640001054
|
||||
data/Example.wav82,331.85469389276,1858.1179130077,2641.2121635204,4074.3609971156
|
||||
data/Example.wav83,362.95343041272,1890.9928289307,2682.2890612514,4041.5172878108
|
||||
data/Example.wav84,395.24816807451,1914.7150588898,2723.9607510773,3988.2281709193
|
||||
data/Example.wav85,411.2056449606,1929.6787431283,2747.8093509512,3937.973080989
|
||||
data/Example.wav86,421.20895459838,1931.8285764322,2762.7062554099,3864.7288960853
|
||||
data/Example.wav87,421.43928593408,1931.0866933353,2757.3828694265,3843.4370860218
|
||||
data/Example.wav88,438.84117644341,1972.4723949097,2762.0587333307,3821.7816269103
|
||||
data/Example.wav89,460.58079366567,2025.948628033,2754.6313403087,3805.124095497
|
||||
data/Example.wav90,421.62905597676,2044.0912153488,2711.1269892068,3769.3898804996
|
||||
data/Example.wav91,399.71741002399,2058.9260658802,2705.7870502926,3758.4098629294
|
||||
data/Example.wav92,416.14766913342,2037.6172281561,2677.054342556,3750.8528066943
|
||||
data/Example.wav93,430.8781057625,2009.8629319817,2639.5492023239,3682.7942428493
|
||||
data/Example.wav94,452.25319778136,1961.5882276272,2559.1694229194,3584.249952029
|
||||
data/Example.wav95,462.44628437735,1909.516047408,2455.4124318309,3545.9309872252
|
||||
data/Example.wav96,482.17313946191,1895.5672946205,2435.9499675054,3540.2498092093
|
||||
data/Example.wav97,490.35065220484,1866.8441069923,2390.5755659871,3495.2652643518
|
||||
data/Example.wav98,493.89642586283,1817.7929698775,2283.9382392473,3432.3026537516
|
||||
data/Example.wav99,499.32606963064,1768.2433574417,2182.6628391829,3404.4990461484
|
||||
data/Example.wav100,501.54606354894,1715.5962165142,2124.6538157508,3438.0197890818
|
||||
data/Example.wav101,496.8346053791,1661.3587948124,2034.4715092381,3429.0955445083
|
||||
data/Example.wav102,493.33019060232,1608.1278006391,1954.9967859222,3486.9523677942
|
||||
data/Example.wav103,490.26973081437,1584.3782120341,1903.4815374306,3501.8928142151
|
||||
data/Example.wav104,483.28591710344,1580.6280556335,1912.0731497185,3490.8593842755
|
||||
data/Example.wav105,474.44279661302,1577.0982086823,1890.4039266946,3458.2198078865
|
||||
data/Example.wav106,475.36082747879,1533.860163092,1899.7547954454,3478.3153656802
|
||||
data/Example.wav107,464.1285437072,1536.3873917434,1882.1706426847,3515.360763387
|
||||
data/Example.wav108,453.77042808842,1544.4154360464,1942.6263553674,3436.1058536443
|
||||
data/Example.wav109,450.13048837516,1594.8789733705,2047.5336956964,3498.6641428002
|
||||
data/Example.wav110,432.11177790474,1639.5515650555,2101.4367396583,3428.2132652911
|
||||
data/Example.wav111,418.74825272536,1681.9736788497,2135.8666050376,3417.2586872462
|
||||
data/Example.wav112,405.3719533601,1731.9672364294,2183.0647484563,3431.8668085449
|
||||
data/Example.wav113,392.88948911473,1786.3264020268,2210.5835715195,3438.776773868
|
||||
data/Example.wav114,373.474737724,1887.6661375527,2309.1027506154,3502.7933820456
|
||||
data/Example.wav115,347.58051484696,1989.5414702858,2441.3326833827,3581.8816356432
|
||||
data/Example.wav116,334.20452402519,2026.0437562222,2537.9952168654,3597.5159762089
|
||||
data/Example.wav117,322.56279270492,2086.2363407419,2655.2152492099,3641.7497208844
|
||||
data/Example.wav118,339.3131062837,2121.955494922,2714.5995462561,3700.8412808396
|
||||
data/Example.wav119,341.31843679468,2125.8452516914,2726.6475819345,3698.1913478513
|
||||
data/Example.wav120,360.47365940763,2083.9305855041,2714.2131982139,3698.8186274682
|
||||
data/Example.wav121,363.11877436488,2061.8058595753,2710.3114520987,3697.0097019804
|
||||
data/Example.wav122,383.06871293946,1994.4962807334,2660.309502954,3714.4305725283
|
||||
data/Example.wav123,404.08621222125,1937.6203617943,2620.6225064107,3662.4969942625
|
||||
data/Example.wav124,413.17243288101,1867.9566432618,2559.8090959119,3645.7829716702
|
||||
data/Example.wav125,420.48916436109,1745.8194458989,2462.295725465,3623.2649125885
|
||||
data/Example.wav126,419.24176322359,1661.3616989629,2402.0839806123,3609.1423656623
|
||||
data/Example.wav127,419.92208177621,1553.3690139745,2212.1644536132,3806.0747605306
|
||||
data/Example.wav128,325.96640217642,1418.2143563765,2164.0253765053,3716.7086388812
|
||||
data/Example.wav129,285.5361624526,1291.4409476962,2178.5623379032,3789.8762049936
|
||||
data/Example.wav130,257.65403995027,1218.5145593312,2178.3165376511,3860.0558330691
|
||||
data/Example.wav131,256.48646950186,1246.2088069493,2187.8836016044,3912.8975753174
|
||||
data/Example.wav132,237.48487454102,1293.1092609666,2224.7222196052,3872.7596483396
|
||||
data/Example.wav133,231.3371136992,1331.6743570151,2252.9991078919,3857.0984512447
|
||||
data/Example.wav134,268.03000592681,1261.3075436434,2215.5175249951,3863.9755245014
|
||||
data/Example.wav135,282.80289770956,1146.358518138,2148.7017687456,3863.410314359
|
||||
data/Example.wav136,296.67440492938,1112.116343058,1999.1259408069,3869.4578382495
|
||||
data/Example.wav137,311.49556238198,1135.8070917503,1863.2277852469,3755.2022809818
|
||||
data/Example.wav138,333.62361376473,1213.1358577695,1879.936459201,3689.2143210604
|
||||
data/Example.wav139,345.56211347062,1318.7162380459,1889.7137934816,3540.8924917934
|
||||
data/Example.wav140,357.40395964001,1414.5240348431,1930.3329697306,3495.1231454417
|
||||
data/Example.wav141,362.92921932599,1520.0324016194,2003.7298816535,3567.9516256789
|
||||
data/Example.wav142,366.73898023917,1601.6325991009,2059.5198495076,3526.7265538346
|
||||
data/Example.wav143,366.69584214898,1685.0005123251,2122.2527064945,3509.322238216
|
||||
data/Example.wav144,350.59280100236,1769.2580008882,2177.2287375601,3518.7642065058
|
||||
data/Example.wav145,334.56543052476,1840.6293059936,2232.970066598,3521.9529347493
|
||||
data/Example.wav146,313.7897966543,1914.7014303275,2311.8075931725,3516.1139605998
|
||||
data/Example.wav147,310.19156328487,1969.8351420049,2425.2288267891,3518.8472234063
|
||||
data/Example.wav148,302.80988516026,2047.9466571048,2553.1701031146,3568.2264549211
|
||||
data/Example.wav149,297.39580786399,2096.6441730092,2654.2971016098,3615.8641189258
|
||||
data/Example.wav150,294.27351217035,2053.2819507505,2657.4310077457,3636.8164191387
|
||||
data/Example.wav151,302.19460420595,2000.4395154592,2667.5072643267,3723.9191493342
|
||||
data/Example.wav152,347.69087739847,1942.306463958,2651.1799637746,3775.0142372833
|
||||
data/Example.wav153,357.69736615595,1854.4442801434,2623.1786215667,3789.0248299735
|
||||
data/Example.wav154,382.29771027292,1760.0695958603,2583.7109294292,3817.8953390063
|
||||
data/Example.wav155,407.54373254369,1697.0890039167,2546.0031580126,3846.0282427007
|
||||
data/Example.wav156,432.90375875753,1668.9418422266,2516.1130250608,3853.6242937275
|
||||
data/Example.wav157,455.80132156975,1673.3618988922,2481.2311712326,3829.6650658339
|
||||
data/Example.wav158,513.15710656811,1728.4942135631,2484.5784016949,3835.8401018631
|
||||
data/Example.wav159,520.4708133912,1794.997626997,2514.1493186531,3892.1599280087
|
||||
data/Example.wav160,494.31146190259,1854.4323305575,2652.8624138335,3917.9945778114
|
||||
data/Example.wav161,451.78729185753,1891.213870271,2709.0307202319,3930.0457537516
|
||||
data/Example.wav162,447.71841803382,1926.0130757949,2769.6267089388,3945.9099584482
|
||||
data/Example.wav163,443.68309101769,1934.2676213287,2832.1539144242,3994.3128903197
|
||||
data/Example.wav164,459.41504232669,1983.5730885475,2868.7306516137,3998.5916884127
|
||||
data/Example.wav165,498.50688019077,2016.6491079952,2909.549290289,3991.1430082239
|
||||
data/Example.wav166,524.0709846093,2090.4154310312,2942.0014272027,3988.8117921514
|
||||
data/Example.wav167,545.81814079133,2156.1304911801,2966.5932667563,3994.0621857005
|
||||
data/Example.wav168,521.59948795582,2190.1831364237,2951.4625773832,3996.2775347309
|
||||
data/Example.wav169,484.75209778826,2175.6299572139,2876.5191747788,3917.8445743522
|
||||
data/Example.wav170,432.82956733879,2169.8999981256,2831.9149239351,3949.9130576946
|
||||
data/Example.wav171,420.21706806093,2174.7133307308,2808.9469815551,3943.579289652
|
||||
data/Example.wav172,427.18100176143,2160.6961216577,2788.9460236025,3954.9059175778
|
||||
data/Example.wav173,452.57482114579,2114.1025676742,2743.4204477576,3905.0830217336
|
||||
data/Example.wav174,467.90226079696,2073.7511504114,2697.4529992115,3887.3859821227
|
||||
data/Example.wav175,475.92000414754,2067.6324802285,2690.4017505884,3857.96586513
|
||||
data/Example.wav176,481.68391133646,2057.0658971149,2681.265502174,3857.239891604
|
||||
data/Example.wav177,478.87145793794,2062.2789451758,2687.1571941166,3870.5494834439
|
||||
data/Example.wav178,477.20648934019,2071.4285466542,2692.923932098,3888.9648361388
|
||||
data/Example.wav179,473.8527188474,2064.982939429,2702.7435699883,3902.5763828741
|
||||
data/Example.wav180,471.57348597334,2058.0083919703,2701.6452198596,3923.3000496937
|
||||
data/Example.wav181,463.39299925515,2062.4157668808,2710.7310472846,3950.0874390583
|
||||
data/Example.wav182,436.63218811152,2108.8274305101,2740.8847893121,3974.5136217463
|
||||
data/Example.wav183,421.93668550809,2124.1204206929,2752.6751275984,4003.0863194578
|
||||
data/Example.wav184,407.25856028376,2116.5799637727,2738.019557875,4117.2982534636
|
||||
data/Example.wav185,393.64185335216,2113.1989480573,2726.6963823161,4193.5175677943
|
||||
data/Example.wav186,364.38846020433,2120.2702394571,2709.5686148431,4305.4762639002
|
||||
data/Example.wav187,349.05101567169,2126.4418179177,2703.4160071699,4336.3331639418
|
||||
data/Example.wav188,382.1411944146,2101.8596066989,2680.9964054382,4291.6578616102
|
||||
data/Example.wav189,433.02739940077,2071.5117295782,2676.9681651134,4266.5163553167
|
||||
data/Example.wav190,464.18026267061,2042.020614906,2682.063520666,4219.539518887
|
||||
data/Example.wav191,469.6691192069,2020.4696403087,2699.8016147476,4214.385167188
|
||||
data/Example.wav192,467.99189099137,1997.9256497791,2704.6732819819,4205.0484095747
|
||||
data/Example.wav193,476.35608207218,1983.4636109557,2700.874753974,4168.4855783136
|
||||
data/Example.wav194,496.53408658262,1973.9974908694,2701.934619156,4137.0760858252
|
||||
data/Example.wav195,511.19249331937,1973.7508004822,2708.4244945865,4154.2537971852
|
||||
data/Example.wav196,522.4695481406,1972.1068175467,2716.813289339,4176.2050581299
|
||||
data/Example.wav197,528.05142584229,1966.0232153203,2725.6564356728,4186.216684233
|
||||
data/Example.wav198,521.3589523206,1937.6779250972,2733.1678407162,4172.0998584209
|
||||
data/Example.wav199,516.48565828683,1912.1493631225,2741.815104888,4174.6646815625
|
||||
data/Example.wav200,508.53898698691,1889.9233802827,2758.8286360602,4180.6414309284
|
||||
data/Example.wav201,511.76952280585,1873.7655572631,2754.979982981,4170.2118052041
|
||||
data/Example.wav202,520.22408989767,1860.5267884894,2751.4742143224,4160.854769779
|
||||
data/Example.wav203,522.12352013622,1869.3167586956,2748.540173029,4163.1638460118
|
||||
data/Example.wav204,522.00266727624,1863.9117553503,2743.4282874248,4165.1485437118
|
||||
data/Example.wav205,514.22151055065,1852.8469855083,2737.6839401259,4186.2289065411
|
||||
data/Example.wav206,512.34645596931,1838.5445814886,2732.2566724166,4211.0681334926
|
||||
data/Example.wav207,513.33207524931,1826.0364732311,2731.4608610801,4236.1350611562
|
||||
data/Example.wav208,526.64008466085,1823.7476761467,2743.7288021458,4233.3588707581
|
||||
data/Example.wav209,535.46958282296,1823.5055596894,2744.8773162777,4247.2223916493
|
||||
data/Example.wav210,554.18085627576,1830.3957764248,2740.5412955579,4265.2772595636
|
||||
data/Example.wav211,570.92141936336,1841.4509122446,2722.4194699268,4300.0972838303
|
||||
data/Example.wav212,617.22315723711,1849.248172076,2718.4887666488,4344.1425718346
|
||||
data/Example.wav213,665.43736400041,1836.0769721094,2729.6398049903,4367.8183999718
|
||||
data/Example.wav214,703.70393143902,1821.333238966,2741.4009387203,4382.2141359022
|
||||
data/Example.wav215,699.44283063031,1783.2536559451,2744.8245925127,4385.9946253066
|
||||
data/Example.wav216,697.3039588797,1749.6664300252,2780.8912480565,4404.0649574703
|
||||
data/Example.wav217,693.61282467771,1728.2942367977,2825.8152590749,4416.2022041456
|
||||
data/Example.wav218,671.054268248,1706.9954675441,2861.3821068079,4409.2646596234
|
||||
data/Example.wav219,658.78280251903,1686.7204976193,2861.0405145442,4386.7229762032
|
||||
data/Example.wav220,653.45315577687,1686.2393832538,2867.4387653491,4377.3885777179
|
||||
data/Example.wav221,648.37556989138,1697.6533091722,2881.3735124079,4377.8057861327
|
||||
data/Example.wav222,665.62073336552,1703.610865382,2880.5355121903,4376.3085303946
|
||||
data/Example.wav223,641.32694371434,1692.8188376314,2888.2339355312,4378.9495329249
|
||||
data/Example.wav224,614.12719575363,1643.3612544808,2838.1975081999,4369.2224175283
|
||||
data/Example.wav225,538.90808502501,1618.465641695,2819.6834265203,4352.2339391976
|
||||
data/Example.wav226,499.70213312514,1625.594124235,2818.3129727707,4345.3237451616
|
||||
data/Example.wav227,513.68333841898,1623.2294798462,2779.8807012173,4319.464067883
|
||||
data/Example.wav228,536.53889205058,1572.8807732309,2677.3376944659,4239.7824252382
|
||||
data/Example.wav229,543.49661586518,1541.25751732,2560.5591721276,4156.2211017785
|
||||
data/Example.wav230,521.92437138737,1566.6446461785,2605.9947517995,4179.1581606505
|
||||
|
+99
-85
@@ -1,46 +1,41 @@
|
||||
__author__ = 'shua'
|
||||
|
||||
import math
|
||||
from typing import Optional
|
||||
|
||||
import argparse
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from inaSpeechSegmenter import tf_mfcc
|
||||
from inaSpeechSegmenter.features import to_wav
|
||||
from inaSpeechSegmenter.sidekit_mfcc import read_wav
|
||||
from numba import int16, njit
|
||||
from scipy.fftpack import fft
|
||||
import wave
|
||||
import os
|
||||
from os import listdir
|
||||
from os.path import isfile, join
|
||||
import math
|
||||
from scipy.fftpack.realtransforms import dct
|
||||
from scipy.signal import lfilter
|
||||
|
||||
from helpers.conch_lpc import lpc
|
||||
from scipy.signal import lfilter, hamming
|
||||
from copy import deepcopy
|
||||
from scipy.fftpack import fft, ifft
|
||||
from scikits.talkbox.linpred import lpc
|
||||
import shutil
|
||||
from helpers.utilities import *
|
||||
|
||||
epsilon = 0.0000000001
|
||||
prefac = .97
|
||||
|
||||
|
||||
def build_data_new(wav_path: str, begin: Optional[int], end: Optional[int]):
    """Load a wav file and return one segment of it, or a list of frames.

    Parameters
    ----------
    wav_path : str
        Path handed to ``read_wav``.
    begin, end : number or None
        Segment boundaries. They are multiplied by the second value returned
        by ``read_wav`` (presumably the sample rate -- confirm against
        ``read_wav``) to obtain sample indices.

    Returns
    -------
    The slice ``y[begin * sr:end * sr]`` when both ``begin`` and ``end`` are
    given; otherwise a list of overlapping frames of ``y``.
    """
    y, sr, _ = read_wav(wav_path, dtype=np.int16)
    if begin is not None and end is not None:
        # Slice bounds must be integers; a fractional begin/end (e.g. 1.2)
        # times sr is a float and would raise TypeError when used as an index.
        return y[int(begin * sr):int(end * sr)]

    # The original fell off the end here and returned None, but the caller
    # indexes the result frame by frame (len(X), X[i]).  Frame the signal the
    # same way build_data does: 480-sample windows taken every 160 samples.
    # NOTE(review): these constants are copied from build_data, which also
    # hard-codes 16000 samples per second -- confirm they suit other rates.
    return [y[i:i + 480] for i in range(0, len(y) - 100, 160)]
|
||||
|
||||
|
||||
def build_data(wav, begin=None, end=None):
|
||||
wav_in_file = wave.Wave_read(str(wav))
|
||||
def build_data(wav,begin=None,end=None):
|
||||
wav_in_file = wave.Wave_read(wav)
|
||||
wav_in_num_samples = wav_in_file.getnframes()
|
||||
N = wav_in_file.getnframes()
|
||||
dstr = wav_in_file.readframes(N)
|
||||
data = np.fromstring(dstr, np.float32)
|
||||
data = np.fromstring(dstr, np.int16)
|
||||
if begin is not None and end is not None:
|
||||
# return data[begin*16000:end*16000] #numpy 1.11.0
|
||||
return data[np.int(begin * 16000):np.int(end * 16000)] # numpy 1.14.0
|
||||
#return data[begin*16000:end*16000] #numpy 1.11.0
|
||||
return data[np.int(begin*16000):np.int(end*16000)] #numpy 1.14.0
|
||||
X = []
|
||||
l = len(data)
|
||||
for i in range(0, l - 100, 160):
|
||||
for i in range(0, l-100, 160):
|
||||
X.append(data[i:i + 480])
|
||||
return X
|
||||
|
||||
|
||||
def periodogram(x, nfft: int, fs=1):
|
||||
def periodogram(x, nfft=None, fs=1):
|
||||
"""Compute the periodogram of the given signal, with the given fft size.
|
||||
|
||||
Parameters
|
||||
@@ -61,6 +56,15 @@ def periodogram(x, nfft: int, fs=1):
|
||||
fgrid : array-like
|
||||
Frequency grid over which the periodogram was estimated.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Generate a signal with two sinusoids, and compute its periodogram:
|
||||
|
||||
>>> fs = 1000
|
||||
>>> x = np.sin(2 * np.pi * 0.1 * fs * np.linspace(0, 0.5, 0.5*fs))
|
||||
>>> x += np.sin(2 * np.pi * 0.2 * fs * np.linspace(0, 0.5, 0.5*fs))
|
||||
>>> px, fx = periodogram(x, 512, fs)
|
||||
|
||||
Notes
|
||||
-----
|
||||
Only real signals supported for now.
|
||||
@@ -82,11 +86,11 @@ def periodogram(x, nfft: int, fs=1):
|
||||
if nfft < n:
|
||||
raise ValueError("nfft < signal size not supported yet")
|
||||
|
||||
pxx = np.abs(np.fft.fft(x, nfft)) ** 2
|
||||
pxx = np.abs(fft(x, nfft)) ** 2
|
||||
if nfft % 2 == 0:
|
||||
pn = nfft // 2 + 1
|
||||
pn = nfft / 2 + 1
|
||||
else:
|
||||
pn = (nfft + 1) // 2
|
||||
pn = (nfft + 1 )/ 2
|
||||
|
||||
fgrid = np.linspace(0, fs * 0.5, pn)
|
||||
return pxx[:pn] / (n * fs), fgrid
|
||||
@@ -133,9 +137,9 @@ def arspec(x, order, nfft=None, fs=1):
|
||||
|
||||
# This is not enough to deal correctly with even/odd size
|
||||
if nfft % 2 == 0:
|
||||
pn = nfft // 2 + 1
|
||||
pn = nfft / 2 + 1
|
||||
else:
|
||||
pn = (nfft + 1) // 2
|
||||
pn = (nfft + 1 )/ 2
|
||||
|
||||
px = 1 / np.fft.fft(a, nfft)[:pn]
|
||||
pxx = np.real(np.conj(px) * px)
|
||||
@@ -180,13 +184,13 @@ def atal(x, order, num_coefs):
|
||||
a, e, kk = lpc(x, order)
|
||||
c = np.zeros(num_coefs)
|
||||
c[0] = a[0]
|
||||
for m in range(1, order + 1):
|
||||
for m in range(1, order+1):
|
||||
c[m] = - a[m]
|
||||
for k in range(1, m):
|
||||
c[m] += (float(k) / float(m) - 1) * a[k] * c[m - k]
|
||||
for m in range(order + 1, num_coefs):
|
||||
for k in range(1, order + 1):
|
||||
c[m] += (float(k) / float(m) - 1) * a[k] * c[m - k]
|
||||
c[m] += (float(k)/float(m)-1)*a[k]*c[m-k]
|
||||
for m in range(order+1, num_coefs):
|
||||
for k in range(1, order+1):
|
||||
c[m] += (float(k)/float(m)-1)*a[k]*c[m-k]
|
||||
return c
|
||||
|
||||
|
||||
@@ -195,7 +199,8 @@ def preemp(input, p):
|
||||
return lfilter([1., -p], 1, input)
|
||||
|
||||
|
||||
def arspecs(input_wav, order, Atal=False):
|
||||
def arspecs(input_wav,order,Atal=False):
|
||||
epsilon = 0.0000000001
|
||||
data = input_wav
|
||||
if Atal:
|
||||
ar = atal(data, order, 30)
|
||||
@@ -204,59 +209,54 @@ def arspecs(input_wav, order, Atal=False):
|
||||
ar = []
|
||||
ars = arspec(data, order, 4096)
|
||||
for k, l in zip(ars[0], ars[1]):
|
||||
ar.append(math.log(math.sqrt((k ** 2) + (l ** 2))))
|
||||
for val in range(0, len(ar)):
|
||||
if ar[val] < 0.0:
|
||||
ar[val] = np.nan
|
||||
elif ar[val] == 0.0:
|
||||
ar[val] = 0.0000000001
|
||||
ar.append(math.log(math.sqrt((k**2)+(l**2))))
|
||||
for val in range(0,len(ar)):
|
||||
if ar[val] == 0.0:
|
||||
ar[val] = deepcopy(epsilon)
|
||||
mspec1 = np.log10(ar)
|
||||
# Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
|
||||
ar = dct(mspec1, type=2, norm='ortho', axis=-1)
|
||||
return ar[:30]
|
||||
|
||||
|
||||
def mfcc(sig: int16[:], pitch):
    """Return the first 50 DCT coefficients of an averaged log-periodogram.

    ``sig`` is cut into ``len(sig) // pitch`` consecutive, equal-length
    segments (at least one).  A 4096-point periodogram is computed for every
    segment and the power estimates are averaged before the log and DCT steps.

    Parameters
    ----------
    sig : array-like
        Sample buffer; it is sliced along its first axis.
    pitch : int
        Divisor that determines how many segments are averaged.

    Returns
    -------
    The leading 50 coefficients of the orthonormal type-II DCT of the
    log-compressed spectrum.
    """
    N = len(sig)
    samps = N // pitch
    if samps == 0:
        samps = 1
    seg_len = N // samps

    # periodogram() returns a (power, frequency_grid) tuple.  Tuples do not
    # support item assignment, so the previous ``specs[0] += ...`` and
    # ``specs[0] /= ...`` raised TypeError; unpack the tuple and accumulate
    # into a private float array instead.
    power, fgrid = periodogram(sig[0:seg_len], nfft=4096)
    power = np.array(power, dtype=float)
    for i in range(1, int(samps)):
        seg_power, _ = periodogram(sig[seg_len * i:seg_len * (i + 1)], nfft=4096)
        power += seg_power
    power /= samps

    with np.errstate(divide='ignore'):
        # log(0) yields -inf where both power and frequency are zero;
        # those entries are replaced by a small positive constant.
        peri = np.log(np.sqrt(power ** 2 + fgrid ** 2))
        peri[np.isneginf(peri)] = 0.0000000001

    # Second log compression (no filterbank is applied in this function).
    mspec = np.log10(peri)

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)

    return ceps[:50]
|
||||
def specPS(input_wav, pitch):
    """Return the first 50 DCT coefficients of an averaged log-periodogram.

    ``input_wav`` is cut into ``len(input_wav) // pitch`` consecutive,
    equal-length segments (at least one).  A 4096-point periodogram is
    computed for every segment and the power estimates are averaged before
    the log and DCT steps.

    Parameters
    ----------
    input_wav : array-like
        Sample buffer; it is sliced along its first axis.
    pitch : int
        Divisor that determines how many segments are averaged.

    Returns
    -------
    The leading 50 coefficients of the orthonormal type-II DCT of the
    log-compressed spectrum.
    """
    N = len(input_wav)
    # Floor division keeps these values integral on Python 3 as well; with
    # true division they become floats and the slices below raise TypeError.
    samps = N // pitch
    if samps == 0:
        samps = 1
    seg_len = N // samps

    # periodogram() returns (power, frequency_grid); accumulate the power
    # estimates of all segments in a private float array.
    power, fgrid = periodogram(input_wav[0:seg_len], nfft=4096)
    power = np.array(power, dtype=float)
    for i in range(1, int(samps)):
        seg_power, _ = periodogram(input_wav[seg_len * i:seg_len * (i + 1)], nfft=4096)
        power += seg_power
    power /= float(samps)

    # Bins where both power and frequency are zero would give log(0); they
    # receive the module-level epsilon instead.
    both_zero = (power == 0) & (fgrid == 0)
    with np.errstate(divide='ignore'):
        log_mag = np.log(np.sqrt(power ** 2 + fgrid ** 2))
    peri = np.where(both_zero, epsilon, log_mag)

    # Second log compression (no filterbank is applied in this function).
    mspec = np.log10(peri)
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)
    return ceps[:50]
|
||||
|
||||
|
||||
def mfcc_new(sig: int16[:], pitch):
    """Return the first 50 DCT coefficients of ``tf_mfcc.mel_spect(sig)``.

    ``pitch`` is accepted for signature compatibility but is not used.
    The first value returned by ``mel_spect`` is discarded; only the second
    one is DCT-transformed.
    """
    _, mel_spectrum = tf_mfcc.mel_spect(sig, nwin=0.256)
    return dct(mel_spectrum, type=2, norm='ortho', axis=-1)[:50]
|
||||
|
||||
|
||||
def build_single_feature_row(data: int16[:], atal: bool = False):
|
||||
lpc_orders = np.array([8, 9, 10, 11, 12, 13, 14, 15, 16, 17])
|
||||
def build_single_feature_row(data, Atal):
|
||||
lpcs = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
|
||||
arr = []
|
||||
periodo = mfcc(data, 50)
|
||||
periodo = specPS(data, 50)
|
||||
arr.extend(periodo)
|
||||
for j in lpc_orders:
|
||||
ars = arspecs(data, j, Atal=atal)
|
||||
for j in lpcs:
|
||||
if Atal:
|
||||
ars = arspecs(data, j, Atal=True)
|
||||
else:
|
||||
ars = arspecs(data, j)
|
||||
arr.extend(ars)
|
||||
for i in range(len(arr)):
|
||||
if np.isnan(np.float(arr[i])):
|
||||
@@ -265,13 +265,13 @@ def build_single_feature_row(data: int16[:], atal: bool = False):
|
||||
|
||||
|
||||
def create_features(input_wav_filename, feature_filename, begin=None, end=None, Atal=False):
|
||||
wav = to_wav(input_wav_filename)
|
||||
X = build_data_new(wav, begin, end)
|
||||
tmp_wav16_filename = generate_tmp_filename("wav")
|
||||
easy_call("sox " + input_wav_filename + " -c 1 -r 16000 " + tmp_wav16_filename)
|
||||
X = build_data(tmp_wav16_filename, begin, end)
|
||||
if begin is not None and end is not None:
|
||||
arr = [input_wav_filename]
|
||||
arr.extend(build_single_feature_row(X, Atal))
|
||||
np.savetxt(feature_filename, np.asarray([arr]), delimiter=",", fmt="%s")
|
||||
os.remove(wav)
|
||||
return arr
|
||||
arcep_mat = []
|
||||
for i in range(len(X)):
|
||||
@@ -279,7 +279,21 @@ def create_features(input_wav_filename, feature_filename, begin=None, end=None,
|
||||
arr.extend(build_single_feature_row(X[i], Atal))
|
||||
arcep_mat.append(arr)
|
||||
np.savetxt(feature_filename, np.asarray(arcep_mat), delimiter=",", fmt="%s")
|
||||
|
||||
os.remove(wav)
|
||||
|
||||
return arcep_mat
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: extract features for one WAV file, optionally
    # restricted to a [begin, end] interval.
    parser = argparse.ArgumentParser(description='Extract features for formants estimation.')
    parser.add_argument('wav_file', default='', help="WAV audio filename (single vowel or an whole utternace)")
    parser.add_argument('feature_file', default='', help="output feature text file")
    parser.add_argument('--begin', default=0.0, type=float, help="beginning time in the WAV file")
    parser.add_argument('--end', default=-1.0, type=float, help="end time in the WAV file")
    args = parser.parse_args()

    # The interval is only forwarded when at least one bound is positive;
    # otherwise create_features falls back to its own defaults.
    interval = ()
    if args.begin > 0.0 or args.end > 0.0:
        interval = (args.begin, args.end)
    create_features(args.wav_file, args.feature_file, *interval)
|
||||
|
||||
|
||||
|
||||
+31
-38
@@ -1,31 +1,27 @@
|
||||
|
||||
import extract_features as features
|
||||
import argparse
|
||||
from helpers.textgrid import *
|
||||
from helpers.utilities import *
|
||||
from load_estimation_model import load_estimation_model
|
||||
import shutil
|
||||
|
||||
|
||||
def predict_from_times(wav_filename, preds_filename, begin, end, csv_export=True):
    """Estimate formants for one interval of a WAV file.

    Features for ``[begin, end]`` are written to a temporary file, which is
    then passed to ``load_estimation_model``.

    Parameters
    ----------
    wav_filename : str
        Path of the audio file to analyse.
    preds_filename : str
        Path handed to ``load_estimation_model`` as its output file.
    begin, end : float
        Interval boundaries, forwarded unchanged.
    csv_export : bool
        Forwarded to ``load_estimation_model``.

    Returns
    -------
    Whatever ``load_estimation_model`` returns.
    """
    tmp_features_filename = generate_tmp_filename("txt")
    print("Input Array Path: " + tmp_features_filename)
    print(wav_filename + " interval " + str(begin) + "-" + str(end) + ":")

    try:
        features.create_features(wav_filename, tmp_features_filename, begin, end)
        return load_estimation_model(tmp_features_filename, preds_filename, begin, end,
                                     csv_export=csv_export)
    finally:
        # Clean up even when feature extraction or the model raises, so a
        # failed run does not leave temporary files behind.
        delete_temp_files()
|
||||
def predict_from_times(wav_filename, preds_filename, begin, end):
    """Run the Lua estimation or tracking model on a WAV file.

    When ``begin`` or ``end`` is positive, features are extracted for that
    interval and ``load_estimation_model.lua`` is run; otherwise features are
    extracted for the whole file and ``load_tracking_model.lua`` is run.

    Parameters
    ----------
    wav_filename : str
        Path of the audio file to analyse.
    preds_filename : str
        Path passed to the Lua script as its output file.
    begin, end : float
        Interval boundaries.
    """
    # mkstemp is the public way to obtain a unique temporary path; the file
    # is created empty and create_features overwrites it.
    fd, tmp_features_filename = tempfile.mkstemp(suffix=".txt")
    os.close(fd)
    print(tmp_features_filename)

    # NOTE(review): the command is assembled by plain string concatenation,
    # so paths containing spaces may break depending on how easy_call runs it.
    if begin > 0.0 or end > 0.0:
        features.create_features(wav_filename, tmp_features_filename, begin, end)
        easy_call("th load_estimation_model.lua " + tmp_features_filename + ' ' + preds_filename)
    else:
        features.create_features(wav_filename, tmp_features_filename)
        easy_call("th load_tracking_model.lua " + tmp_features_filename + ' ' + preds_filename)
|
||||
|
||||
|
||||
def predict_from_textgrid(wav_filename, preds_filename, textgrid_filename, textgrid_tier):
|
||||
print(wav_filename)
|
||||
|
||||
print wav_filename
|
||||
|
||||
if os.path.exists(preds_filename):
|
||||
os.remove(preds_filename)
|
||||
@@ -38,27 +34,24 @@ def predict_from_textgrid(wav_filename, preds_filename, textgrid_filename, textg
|
||||
# extract tier names
|
||||
tier_names = textgrid.tierNames()
|
||||
|
||||
|
||||
if textgrid_tier in tier_names: # run over all intervals in the tier
|
||||
if textgrid_tier in tier_names:
|
||||
tier_index = tier_names.index(textgrid_tier)
|
||||
textgrid_tier = textgrid[tier_index]
|
||||
else: # process first tier
|
||||
textgrid_tier = textgrid[0]
|
||||
|
||||
for interval in textgrid_tier:
|
||||
if re.search(r'\S', interval.mark()):
|
||||
tmp_features_filename = generate_tmp_filename("features")
|
||||
tmp_preds = generate_tmp_filename("preds")
|
||||
begin = interval.xmin()
|
||||
end = interval.xmax()
|
||||
features.create_features(wav_filename, tmp_features_filename, begin, end)
|
||||
load_estimation_model(tmp_features_filename, tmp_preds, begin, end)
|
||||
#easy_call("th load_estimation_model.lua " + tmp_features_filename + ' ' + tmp_preds)
|
||||
csv_append_row(tmp_preds, preds_filename)
|
||||
delete_temp_files()
|
||||
|
||||
delete_temp_files()
|
||||
|
||||
# run over all intervals in the tier
|
||||
for interval in textgrid[tier_index]:
|
||||
if re.search(r'\S', interval.mark()):
|
||||
tmp_features_filename = generate_tmp_filename("features")
|
||||
tmp_preds = generate_tmp_filename("preds")
|
||||
features.create_features(wav_filename, tmp_features_filename, interval.xmin(), interval.xmax())
|
||||
easy_call("th load_estimation_model.lua " + tmp_features_filename + ' ' + tmp_preds)
|
||||
csv_append_row(tmp_preds, preds_filename)
|
||||
else: # process first tier
|
||||
for interval in textgrid[0]:
|
||||
if re.search(r'\S', interval.mark()):
|
||||
tmp_features_filename = generate_tmp_filename("features")
|
||||
tmp_preds = generate_tmp_filename("preds")
|
||||
features.create_features(wav_filename, tmp_features_filename, interval.xmin(), interval.xmax())
|
||||
easy_call("th load_estimation_model.lua " + tmp_features_filename + ' ' + tmp_preds)
|
||||
csv_append_row(tmp_preds, preds_filename)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# parse arguments
|
||||
|
||||
+2
-2
@@ -4,12 +4,12 @@ if [ $# -eq 2 ]
|
||||
then
|
||||
tempfile=`mktemp -t txt`
|
||||
python extract_features.py $1 $tempfile
|
||||
luajit load_estimation_model.lua $tempfile $2
|
||||
th load_estimation_model.lua $tempfile $2
|
||||
elif [ $# -eq 4 ]
|
||||
then
|
||||
tempfile=`mktemp -t txt`
|
||||
python extract_features.py $1 $tempfile --begin $3 --end $4
|
||||
luajit load_estimation_model.lua $tempfile $2
|
||||
th load_estimation_model.lua $tempfile $2
|
||||
else
|
||||
echo "$0 wav_filename pred_csv_filename [begin_time end_time]"
|
||||
fi
|
||||
|
||||
@@ -1,287 +0,0 @@
|
||||
# This file has been copied (with minor changes) from Michael
|
||||
# McAuliffe's Conch project, to provide a compatible replacement
|
||||
# implementation of the lpc() function from the obsolete Python-2-only
|
||||
# scikits.talkbox library.
|
||||
#
|
||||
# Conch repository: https://github.com/mmcauliffe/Conch-sounds
|
||||
# Source: https://github.com/mmcauliffe/Conch-sounds/blob/master/conch/analysis/formants/lpc.py
|
||||
|
||||
# Copyright (c) 2015 Michael McAuliffe
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
#import librosa
|
||||
import librosa
|
||||
import numpy as np
|
||||
import scipy as sp
|
||||
from numba import njit
|
||||
from scipy.signal import lfilter
|
||||
|
||||
from scipy.fftpack import fft, ifft
|
||||
from scipy.signal.windows import gaussian
|
||||
|
||||
|
||||
@njit
|
||||
def next_pow_2(x: float) -> int:
    """Return the first integer N such that 2**N >= abs(x).

    The result is converted to ``int`` so that it matches the annotation and
    the documented contract (``np.ceil`` alone returns a float).
    ``x == 0`` is not handled: ``log2(0)`` is ``-inf``.
    """
    return int(np.ceil(np.log2(np.abs(x))))
|
||||
|
||||
|
||||
def lpc_ref(signal, order):
    """Reference LPC solver based on direct inversion of a Toeplitz matrix.

    The autocorrelation ``r`` of ``signal`` is computed for lags
    ``0..order`` and the normal equations
    ``toeplitz(r[0:order]) @ phi = -r[1:order + 1]`` are solved by explicitly
    inverting the matrix.

    Parameters
    ----------
    signal : ndarray
        One-dimensional input signal.
    order : int
        LPC order (the output has ``order + 1`` items).

    Returns
    -------
    ndarray
        A leading ``1`` followed by the ``order`` solved coefficients. For
        ``order <= 0`` a single-element float32 array holding ``1``.

    Raises
    ------
    ValueError
        If ``signal`` has more than one dimension or fewer samples than
        ``order``.

    Notes
    -----
    Kept for reference only: the explicit matrix inversion is slow.
    """
    if signal.ndim > 1:
        raise ValueError("Array of rank > 1 not supported yet")
    if order > signal.size:
        raise ValueError("Input signal must have a lenght >= lpc order")

    if order <= 0:
        return np.ones(1, dtype='float32')

    n_coeffs = order + 1
    autocorr = np.zeros(n_coeffs, 'float32')
    # Number of non-zero autocorrelation values available for the
    # n_coeffs LPC coefficients.
    n_valid = min(n_coeffs, signal.size)
    full_corr = np.correlate(signal, signal, 'full')
    # Lag 0 sits at index signal.size - 1 of the 'full' correlation.
    autocorr[:n_valid] = full_corr[signal.size - 1:signal.size + order]

    toeplitz_inverse = sp.linalg.inv(sp.linalg.toeplitz(autocorr[:-1]))
    phi = np.dot(toeplitz_inverse, -autocorr[1:])
    return np.concatenate(([1.], phi))
|
||||
|
||||
|
||||
# @jit
|
||||
def levinson_1d(r, order):
    """Solve a symmetric Toeplitz system with the Levinson-Durbin recursion.

    Parameters
    ----------
    r : array-like
        One-dimensional sequence defining the Toeplitz matrix (generally the
        autocorrelation of a signal). Its first item must be a non-zero real.
    order : int
        Number of recursion steps; must be at most ``len(r) - 1``.

    Returns
    -------
    a : ndarray
        ``order + 1`` float32 coefficients, with ``a[0] == 1``.
    e : scalar
        Final prediction error.
    k : ndarray
        ``order`` float32 reflection coefficients.

    Raises
    ------
    ValueError
        If ``r`` has rank > 1, is empty, is shorter than ``order + 1``, or
        its first item is not a finite non-zero real.

    Notes
    -----
    The recursion solves ``R a = -r[1:]`` for the Toeplitz matrix ``R`` built
    from ``r`` in O(order^2) instead of the O(order^3) of a generic solver.
    This is a plain-Python loop and therefore slow for large orders.
    """
    r = np.atleast_1d(r)
    if r.ndim > 1:
        raise ValueError("Only rank 1 are supported for now.")

    n = r.size
    if n < 1:
        raise ValueError("Cannot operate on empty array !")
    if order > n - 1:
        raise ValueError("Order should be <= size-1")

    if not np.isreal(r[0]):
        raise ValueError("First item of input must be real.")
    if not np.isfinite(1 / r[0]):
        raise ValueError("First item should be != 0")

    coeffs = np.empty(order + 1, 'float32')      # estimated coefficients
    snapshot = np.empty(order + 1, 'float32')    # copy taken before each update
    reflection = np.empty(order, 'float32')      # reflection coefficients

    coeffs[0] = 1.
    err = r[0]

    for step in range(1, order + 1):
        acc = r[step]
        for lag in range(1, step):
            acc += coeffs[lag] * r[step - lag]

        # Store first, then read back, so later arithmetic uses the
        # float32-rounded value exactly as held in the array.
        reflection[step - 1] = -acc / err
        refl = reflection[step - 1]
        coeffs[step] = refl

        # The update below must read the coefficients as they were before
        # this step modified them.
        snapshot[:order] = coeffs[:order]
        for lag in range(1, step):
            coeffs[lag] += refl * np.conj(snapshot[step - lag])

        err *= 1 - refl * np.conj(refl)

    return coeffs, err, reflection
|
||||
|
||||
|
||||
# @jit
|
||||
def _acorr_last_axis(x, nfft, maxlag):
|
||||
a = np.real(ifft(np.abs(fft(x, n=nfft) ** 2)))
|
||||
return a[..., :maxlag + 1] / x.shape[-1]
|
||||
|
||||
|
||||
# @jit
|
||||
def acorr_lpc(x, axis=-1):
    """Compute the autocorrelation of ``x`` along ``axis`` via the FFT.

    The result is divided by the length of the input along that axis and
    contains lags ``0..x.shape[axis]``.

    Raises
    ------
    ValueError
        If ``x`` is a complex array.
    """
    if not np.isrealobj(x):
        raise ValueError("Complex input not supported yet")

    maxlag = x.shape[axis]
    # FFT length: the power of two that covers 2 * maxlag - 1 samples.
    nfft = int(2 ** next_pow_2(2 * maxlag - 1))

    if axis == -1:
        return _acorr_last_axis(x, nfft, maxlag)

    # Move the requested axis last, correlate, then move it back.
    moved = np.swapaxes(x, -1, axis)
    return np.swapaxes(_acorr_last_axis(moved, nfft, maxlag), -1, axis)
|
||||
|
||||
|
||||
# @jit
|
||||
def lpc(signal, order, axis=-1):
    """Compute Linear Prediction Coefficients with Levinson-Durbin.

    The autocorrelation of ``signal`` is computed with ``acorr_lpc`` and
    passed to ``levinson_1d``.

    Parameters
    ----------
    signal : ndarray
        Input signal.
    order : int
        LPC order (the coefficient output has ``order + 1`` items).
    axis : int
        Axis along which the autocorrelation is taken.

    Returns
    -------
    a : array-like
        The solution of the inversion.
    e : array-like
        The prediction error.
    k : array-like
        Reflection coefficients.

    Raises
    ------
    ValueError
        If ``order`` exceeds the signal length along ``axis``.
    """
    if order > signal.shape[axis]:
        raise ValueError("Input signal must have length >= order")
    return levinson_1d(acorr_lpc(signal, axis), order)
|
||||
|
||||
|
||||
def process_frame(X, window, num_formants, new_sr):
    """Estimate candidate frequencies and bandwidths for one frame.

    The frame is windowed, an LPC polynomial of order ``2 * num_formants`` is
    fitted, and every root with non-negative imaginary part is converted to a
    frequency (from its angle) and a bandwidth (from its magnitude).

    Returns
    -------
    frqs, bw : ndarray
        Frequencies sorted in ascending order and the matching bandwidths.
    """
    coeffs, _error, _reflection = lpc(X * window, num_formants * 2)

    roots = np.roots(coeffs)
    roots = roots[np.imag(roots) >= 0]

    hz_per_radian = new_sr / (2 * np.pi)
    freqs = np.arctan2(np.imag(roots), np.real(roots)) * hz_per_radian
    ascending = np.argsort(freqs)

    bandwidths = -1 / 2 * hz_per_radian * np.log(np.abs(roots[ascending]))
    return freqs[ascending], bandwidths
|
||||
|
||||
|
||||
def lpc_formants(signal, sr, num_formants, max_freq, time_step,
                 win_len, window_shape='gaussian'):
    """Track formant candidates frame by frame with LPC.

    The signal is pre-emphasised, resampled to ``2 * max_freq`` when its
    sample rate is higher, and analysed in overlapping windows.

    Parameters
    ----------
    signal : array-like
        Input samples.
    sr : number
        Sample rate of ``signal``.
    num_formants : int
        Number of formants per frame; the LPC order is twice this value.
    max_freq : number
        Highest frequency of interest; analysis runs at ``2 * max_freq``.
    time_step : float
        Hop between frame centres, in seconds.
    win_len : float
        Window length, in seconds.
    window_shape : str
        ``'gaussian'`` selects a Gaussian window; anything else a Hann window.

    Returns
    -------
    dict
        Maps each frame-centre time to a list of ``(frequency, bandwidth)``
        tuples, padded with ``(None, None)`` up to ``num_formants`` entries.
    """
    output = {}
    new_sr = 2 * max_freq
    # First-order pre-emphasis filter.
    alpha = np.exp(-2 * np.pi * 50 * (1 / new_sr))
    proc = lfilter([1., -alpha], 1, signal)
    if sr > new_sr:
        # Sample rates are passed by keyword: recent librosa releases no
        # longer accept them positionally.
        proc = librosa.resample(proc, orig_sr=sr, target_sr=new_sr)

    nperseg = int(win_len * new_sr)
    nperstep = int(time_step * new_sr)
    half = int(nperseg / 2)
    if window_shape == 'gaussian':
        window = gaussian(nperseg + 2, 0.45 * (nperseg - 1) / 2)[1:nperseg + 1]
    else:
        window = np.hanning(nperseg + 2)[1:nperseg + 1]

    indices = np.arange(half, proc.shape[0] - half + 1, nperstep)
    # Odd-length windows need one extra sample on the right of the centre.
    extra = 1 if nperseg % 2 != 0 else 0
    for center in indices:
        X = proc[center - half:center + half + extra]
        frqs, bw = process_frame(X, window, num_formants, new_sr)

        # Keep candidates at least 50 Hz away from 0 and from max_freq.
        # ndarray.item() replaces np.asscalar, which was removed from NumPy.
        formants = [
            (f.item(), b.item())
            for f, b in zip(frqs, bw)
            if not (f < 50 or f > max_freq - 50)
        ]

        missing = num_formants - len(formants)
        if missing:
            formants += [(None, None)] * missing
        output[center / new_sr] = formants
    return output
|
||||
|
||||
|
||||
#class FormantTrackFunction(BaseAnalysisFunction):
|
||||
# def __init__(self, num_formants=5, max_frequency=5000,
|
||||
# time_step=0.01, window_length=0.025, window_shape='gaussian'):
|
||||
# super(FormantTrackFunction, self).__init__()
|
||||
# self.arguments = [num_formants, max_frequency, time_step, window_length, window_shape]
|
||||
# self._function = lpc_formants
|
||||
# self.requires_file = False
|
||||
+1
-10
@@ -25,8 +25,6 @@ import wave
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
from isort import file
|
||||
|
||||
|
||||
def csv_append_row(tmp_preds, preds_filename, with_headers=True):
|
||||
|
||||
@@ -57,9 +55,7 @@ def csv_append_row(tmp_preds, preds_filename, with_headers=True):
|
||||
|
||||
|
||||
def generate_tmp_filename(extension):
    """Return a fresh file name of the form ``temp/<random>.<extension>``.

    The ``temp`` directory is created in the current working directory when
    it does not exist yet. The file itself is not created.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs('temp', exist_ok=True)
    return "temp/" + next(tempfile._get_candidate_names()) + "." + extension
|
||||
|
||||
|
||||
def logging_defaults(logging_level="INFO"):
|
||||
@@ -173,8 +169,3 @@ def is_valid_wav(filename):
|
||||
or wav_file.getcomptype() != 'NONE':
|
||||
return False
|
||||
return True
|
||||
|
||||
def delete_temp_files():
    """Remove every regular file inside the ``temp`` directory.

    Does nothing (beyond printing the status line) when ``temp`` does not
    exist; sub-directories are left untouched.
    """
    print("Clearing temp files...")
    if not os.path.isdir("temp"):
        return
    for filename in os.listdir("temp"):
        path = os.path.join("temp", filename)
        # os.remove cannot delete directories; skip anything that is not a file.
        if os.path.isfile(path):
            os.remove(path)
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from functools import reduce
|
||||
|
||||
|
||||
class LambdaBase(nn.Sequential):
    """Sequential container that also carries a plain callable.

    Sub-classes decide how ``lambda_func`` is combined with the outputs of
    the contained modules.
    """

    def __init__(self, fn, *args):
        super().__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        """Apply every contained module to ``input`` independently.

        Returns the list of module outputs, or ``input`` itself when the
        container holds no modules.
        """
        outputs = [module(input) for module in self._modules.values()]
        return outputs or input
|
||||
|
||||
|
||||
class Lambda(LambdaBase):
    """Apply ``lambda_func`` once to the result of ``forward_prepare``."""

    def forward(self, input):
        prepared = self.forward_prepare(input)
        return self.lambda_func(prepared)
|
||||
|
||||
|
||||
class LambdaMap(LambdaBase):
    """Apply ``lambda_func`` to each item of ``forward_prepare``'s result."""

    def forward(self, input):
        prepared = self.forward_prepare(input)
        return list(map(self.lambda_func, prepared))
|
||||
|
||||
|
||||
class LambdaReduce(LambdaBase):
    """Fold ``forward_prepare``'s result with ``lambda_func`` via ``reduce``."""

    def forward(self, input):
        prepared = self.forward_prepare(input)
        return reduce(self.lambda_func, prepared)
|
||||
|
||||
|
||||
def load_estimation_model(inputfilename, outputfilename, begin, end, csv_export=True):
    """Predict four formants from a single comma-separated feature row.

    The input file holds one row: a name followed by 350 numeric features.
    The network weights are read from ``em.pth`` in the current working
    directory.

    Parameters
    ----------
    inputfilename : str
        Path of the feature file.
    outputfilename : str
        Path of the CSV file written when ``csv_export`` is true.
    begin, end
        Interval boundaries; only echoed into the CSV output.
    csv_export : bool
        Whether to write the prediction to ``outputfilename``.

    Returns
    -------
    dict
        Keys ``"F1"``..``"F4"``, each holding the network output times 1000.

    Raises
    ------
    ValueError
        If the row does not contain exactly 350 feature values, or one of
        them is not a number.
    """
    n_features = 350

    with open(inputfilename, "r") as rf:
        fields = [field.strip() for field in rf.read().split(",")]

    name, raw_values = fields[0], fields[1:]
    # Build the input explicitly instead of filling an uninitialised tensor:
    # a short row would otherwise leave arbitrary memory in the input.
    if len(raw_values) != n_features:
        raise ValueError(
            "Expected %d feature values in %s, found %d"
            % (n_features, inputfilename, len(raw_values)))
    data = torch.tensor([[float(value) for value in raw_values]])

    def as_row(x):
        # Promote a 1-D input to a single-row batch.
        return x.view(1, -1) if 1 == len(x.size()) else x

    def layer(n_in, n_out):
        # Module nesting must stay as-is: the state-dict keys depend on it.
        return nn.Sequential(Lambda(as_row), nn.Linear(n_in, n_out))

    model = nn.Sequential(
        layer(n_features, 1024),
        nn.Sigmoid(),
        layer(1024, 512),
        nn.Sigmoid(),
        layer(512, 256),
        nn.Sigmoid(),
        layer(256, 4),
    )

    model.load_state_dict(torch.load("em.pth"))
    # Inference only: no gradients are needed.
    with torch.no_grad():
        my_prediction = model(data)

    # NOTE(review): the factor 1000 presumably converts kHz to Hz - confirm.
    prediction_dict = {
        "F%d" % (index + 1): 1000 * float(my_prediction[0][index])
        for index in range(4)
    }

    if csv_export:
        with open(outputfilename, "w") as wf:
            wf.write("NAME,begin,end,F1,F2,F3,F4\n")
            wf.write(name + "," + str(begin) + "," + str(end) + "," +
                     str(prediction_dict["F1"]) + "," + str(prediction_dict["F2"]) + "," +
                     str(prediction_dict["F3"]) + "," + str(prediction_dict["F4"]) + "\n")

    return prediction_dict
|
||||
@@ -1,13 +0,0 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from inaSpeechSegmenter import tf_mfcc
|
||||
|
||||
from formants import predict_from_times
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: run the estimator on a bundled example recording.
    os.environ.update({'XLA_FLAGS': '--xla_gpu_cuda_data_dir=/opt/cuda'})

    # Alternative invocations kept for manual experiments:
    # predict_from_times('data/VT 150hz baseline example.mp3', 'data/VT Predictions.csv', 0, 1)
    # tf_mfcc.power_spectrum(np.zeros(1024, dtype=np.int16), 1024, 512)
    # predict_from_times('data/Example.wav', 'data/Example-Predictions.csv', 0, 1)
    predict_from_times('data/Example-f32le.wav', 'data/Example-F32-Predictions.csv', 0, 1)
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user