Compare commits
`webui-v1.1...main` (130 commits)
| SHA1 |
|---|
| 63de471300 |
| 7885045775 |
| 8fa361c83b |
| 58e2cf78e6 |
| 61f86358ac |
| 7c84219238 |
| a3e0bc1a82 |
| d9345d73fa |
| 96de086509 |
| bcce0787b3 |
| aa3b066668 |
| 1f102f3c09 |
| 3a19b67247 |
| 1c99b4120b |
| 739105573d |
| 8b5acbc877 |
| e7dd856db8 |
| e3a941bad2 |
| 7946460e82 |
| 4ab49d946a |
| d2b7db3687 |
| bd90394d1d |
| 3cc4ecedd5 |
| e84523e572 |
| becf5145e5 |
| e54309d74d |
| 8a9535c0c7 |
| 2262cb8212 |
| 496dd8486a |
| e9441dbc7f |
| e01c21e65f |
| 758761ff99 |
| cad389b2ee |
| 46d68bcf8e |
| 2006097768 |
| 963089cc90 |
| 2e116a44ab |
| a5a0fed4e1 |
| cec3206028 |
| ccaa1db0e3 |
| 0cdb8554b5 |
| 1ba65b1b55 |
| f2f877762a |
| b888c11b33 |
| ac21cd274e |
| 86d23945b6 |
| 460222c845 |
| 7abb84e8b6 |
| 3938f0581b |
| 3ba2c740f9 |
| 7c9bda5ec9 |
| 5c8d5aa943 |
| aa6feb1178 |
| b31e0a414c |
| 2ca4200641 |
| 18a3d273e8 |
| a24999f41b |
| 809232f8d8 |
| 9cd6f36696 |
| a9b43a8afc |
| 6d65db1f76 |
| 576424fe58 |
| e97b185188 |
| 76c9cb239d |
| cb1f29d1ed |
| 9bbc9e9246 |
| 3b8d7b5ef4 |
| 631f97eff7 |
| 291d8ddf5e |
| 1d7e8fc637 |
| 9398660323 |
| 889433e6f4 |
| 6959ddcfc2 |
| 34c5d91a85 |
| bf560f2a68 |
| f1a6b82feb |
| 1b2c9b9631 |
| 83a6042731 |
| 3ae932ddaf |
| f44da7617f |
| 9459e87253 |
| 7a0c67c8c3 |
| 6b808d4a31 |
| 6444839ec0 |
| 8a4fdd263a |
| 0fe10b449e |
| f8b398f587 |
| 3d7e4220d4 |
| 2612e5dbcc |
| eb7eb8a022 |
| 05dbf649a1 |
| e33f8919d0 |
| 8e1893daf7 |
| b3e7ad0e50 |
| b35f4bc727 |
| 2f9f7c4b31 |
| 8160bd71d0 |
| 2c1276b8d9 |
| 1f19649e92 |
| 0f1fc8cb99 |
| 4f316f2f64 |
| c90fb9f63c |
| 02e3fd0a09 |
| 41173bfec7 |
| c4ab2501e4 |
| ff4078c098 |
| 7b4273f514 |
| cbadc7c0db |
| 56972cc455 |
| 94a36f612f |
| 0630c258ea |
| d79aadb786 |
| aa01f7b73e |
| eff14d53e5 |
| d06efc5fe7 |
| e7f0574fc8 |
| 5aea258e26 |
| ea491a457f |
| 5093ba0b9a |
| bbe2638855 |
| 8a04e3e824 |
| fbe46caa3d |
| 1fa2fa8642 |
| 1698f0dd8a |
| 11b2ea0f88 |
| 4a17d6580c |
| 4c45e6a74c |
| de2a885ab2 |
| f45e33c1ad |
| 15c4db56ba |
+162
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
Generated
-3
@@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
Generated
-12
@@ -1,12 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.7 (VITS)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
</module>
-154
@@ -1,154 +0,0 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="132">
            <item index="0" class="java.lang.String" itemvalue="ccxt" />
            <item index="1" class="java.lang.String" itemvalue="lz4" />
            <item index="2" class="java.lang.String" itemvalue="pre-commit" />
            <item index="3" class="java.lang.String" itemvalue="elegantrl" />
            <item index="4" class="java.lang.String" itemvalue="setuptools" />
            <item index="5" class="java.lang.String" itemvalue="ray" />
            <item index="6" class="java.lang.String" itemvalue="gputil" />
            <item index="7" class="java.lang.String" itemvalue="google-pasta" />
            <item index="8" class="java.lang.String" itemvalue="tensorflow-estimator" />
            <item index="9" class="java.lang.String" itemvalue="scikit-learn" />
            <item index="10" class="java.lang.String" itemvalue="tabulate" />
            <item index="11" class="java.lang.String" itemvalue="multitasking" />
            <item index="12" class="java.lang.String" itemvalue="pickleshare" />
            <item index="13" class="java.lang.String" itemvalue="pyasn1-modules" />
            <item index="14" class="java.lang.String" itemvalue="ipython-genutils" />
            <item index="15" class="java.lang.String" itemvalue="Pygments" />
            <item index="16" class="java.lang.String" itemvalue="mccabe" />
            <item index="17" class="java.lang.String" itemvalue="astunparse" />
            <item index="18" class="java.lang.String" itemvalue="lxml" />
            <item index="19" class="java.lang.String" itemvalue="Werkzeug" />
            <item index="20" class="java.lang.String" itemvalue="tensorboard-data-server" />
            <item index="21" class="java.lang.String" itemvalue="jupyter-client" />
            <item index="22" class="java.lang.String" itemvalue="pexpect" />
            <item index="23" class="java.lang.String" itemvalue="click" />
            <item index="24" class="java.lang.String" itemvalue="ipykernel" />
            <item index="25" class="java.lang.String" itemvalue="pandas-datareader" />
            <item index="26" class="java.lang.String" itemvalue="psutil" />
            <item index="27" class="java.lang.String" itemvalue="jedi" />
            <item index="28" class="java.lang.String" itemvalue="regex" />
            <item index="29" class="java.lang.String" itemvalue="tensorboard" />
            <item index="30" class="java.lang.String" itemvalue="platformdirs" />
            <item index="31" class="java.lang.String" itemvalue="matplotlib" />
            <item index="32" class="java.lang.String" itemvalue="idna" />
            <item index="33" class="java.lang.String" itemvalue="rsa" />
            <item index="34" class="java.lang.String" itemvalue="decorator" />
            <item index="35" class="java.lang.String" itemvalue="numpy" />
            <item index="36" class="java.lang.String" itemvalue="pyasn1" />
            <item index="37" class="java.lang.String" itemvalue="requests" />
            <item index="38" class="java.lang.String" itemvalue="tensorflow" />
            <item index="39" class="java.lang.String" itemvalue="tensorboard-plugin-wit" />
            <item index="40" class="java.lang.String" itemvalue="Deprecated" />
            <item index="41" class="java.lang.String" itemvalue="nest-asyncio" />
            <item index="42" class="java.lang.String" itemvalue="prompt-toolkit" />
            <item index="43" class="java.lang.String" itemvalue="keras-tuner" />
            <item index="44" class="java.lang.String" itemvalue="scipy" />
            <item index="45" class="java.lang.String" itemvalue="dataclasses" />
            <item index="46" class="java.lang.String" itemvalue="tornado" />
            <item index="47" class="java.lang.String" itemvalue="google-auth-oauthlib" />
            <item index="48" class="java.lang.String" itemvalue="black" />
            <item index="49" class="java.lang.String" itemvalue="toml" />
            <item index="50" class="java.lang.String" itemvalue="Quandl" />
            <item index="51" class="java.lang.String" itemvalue="pandas" />
            <item index="52" class="java.lang.String" itemvalue="termcolor" />
            <item index="53" class="java.lang.String" itemvalue="pylint" />
            <item index="54" class="java.lang.String" itemvalue="typing_extensions" />
            <item index="55" class="java.lang.String" itemvalue="cachetools" />
            <item index="56" class="java.lang.String" itemvalue="debugpy" />
            <item index="57" class="java.lang.String" itemvalue="isort" />
            <item index="58" class="java.lang.String" itemvalue="pytz" />
            <item index="59" class="java.lang.String" itemvalue="inflection" />
            <item index="60" class="java.lang.String" itemvalue="Pillow" />
            <item index="61" class="java.lang.String" itemvalue="traitlets" />
            <item index="62" class="java.lang.String" itemvalue="absl-py" />
            <item index="63" class="java.lang.String" itemvalue="protobuf" />
            <item index="64" class="java.lang.String" itemvalue="joblib" />
            <item index="65" class="java.lang.String" itemvalue="threadpoolctl" />
            <item index="66" class="java.lang.String" itemvalue="opt-einsum" />
            <item index="67" class="java.lang.String" itemvalue="python-dateutil" />
            <item index="68" class="java.lang.String" itemvalue="gpflow" />
            <item index="69" class="java.lang.String" itemvalue="astroid" />
            <item index="70" class="java.lang.String" itemvalue="cycler" />
            <item index="71" class="java.lang.String" itemvalue="gast" />
            <item index="72" class="java.lang.String" itemvalue="kt-legacy" />
            <item index="73" class="java.lang.String" itemvalue="appdirs" />
            <item index="74" class="java.lang.String" itemvalue="tensorflow-probability" />
            <item index="75" class="java.lang.String" itemvalue="pip" />
            <item index="76" class="java.lang.String" itemvalue="pyzmq" />
            <item index="77" class="java.lang.String" itemvalue="certifi" />
            <item index="78" class="java.lang.String" itemvalue="oauthlib" />
            <item index="79" class="java.lang.String" itemvalue="pyparsing" />
            <item index="80" class="java.lang.String" itemvalue="Markdown" />
            <item index="81" class="java.lang.String" itemvalue="h5py" />
            <item index="82" class="java.lang.String" itemvalue="wrapt" />
            <item index="83" class="java.lang.String" itemvalue="kiwisolver" />
            <item index="84" class="java.lang.String" itemvalue="empyrical" />
            <item index="85" class="java.lang.String" itemvalue="backcall" />
            <item index="86" class="java.lang.String" itemvalue="charset-normalizer" />
            <item index="87" class="java.lang.String" itemvalue="multipledispatch" />
            <item index="88" class="java.lang.String" itemvalue="pathspec" />
            <item index="89" class="java.lang.String" itemvalue="jupyter-core" />
            <item index="90" class="java.lang.String" itemvalue="matplotlib-inline" />
            <item index="91" class="java.lang.String" itemvalue="ptyprocess" />
            <item index="92" class="java.lang.String" itemvalue="more-itertools" />
            <item index="93" class="java.lang.String" itemvalue="mypy-extensions" />
            <item index="94" class="java.lang.String" itemvalue="cloudpickle" />
            <item index="95" class="java.lang.String" itemvalue="wcwidth" />
            <item index="96" class="java.lang.String" itemvalue="requests-oauthlib" />
            <item index="97" class="java.lang.String" itemvalue="Keras-Preprocessing" />
            <item index="98" class="java.lang.String" itemvalue="yfinance" />
            <item index="99" class="java.lang.String" itemvalue="tomli" />
            <item index="100" class="java.lang.String" itemvalue="urllib3" />
            <item index="101" class="java.lang.String" itemvalue="six" />
            <item index="102" class="java.lang.String" itemvalue="parso" />
            <item index="103" class="java.lang.String" itemvalue="wheel" />
            <item index="104" class="java.lang.String" itemvalue="ipython" />
            <item index="105" class="java.lang.String" itemvalue="packaging" />
            <item index="106" class="java.lang.String" itemvalue="lazy-object-proxy" />
            <item index="107" class="java.lang.String" itemvalue="grpcio" />
            <item index="108" class="java.lang.String" itemvalue="dm-tree" />
            <item index="109" class="java.lang.String" itemvalue="google-auth" />
            <item index="110" class="java.lang.String" itemvalue="seaborn" />
            <item index="111" class="java.lang.String" itemvalue="thop" />
            <item index="112" class="java.lang.String" itemvalue="torch" />
            <item index="113" class="java.lang.String" itemvalue="torchvision" />
            <item index="114" class="java.lang.String" itemvalue="d2l" />
            <item index="115" class="java.lang.String" itemvalue="keyboard" />
            <item index="116" class="java.lang.String" itemvalue="transformers" />
            <item index="117" class="java.lang.String" itemvalue="phonemizer" />
            <item index="118" class="java.lang.String" itemvalue="Unidecode" />
            <item index="119" class="java.lang.String" itemvalue="nltk" />
            <item index="120" class="java.lang.String" itemvalue="pinecone-client" />
            <item index="121" class="java.lang.String" itemvalue="sentence-transformers" />
            <item index="122" class="java.lang.String" itemvalue="whisper" />
            <item index="123" class="java.lang.String" itemvalue="datasets" />
            <item index="124" class="java.lang.String" itemvalue="pyaudio" />
            <item index="125" class="java.lang.String" itemvalue="torchsummary" />
            <item index="126" class="java.lang.String" itemvalue="openjtalk" />
            <item index="127" class="java.lang.String" itemvalue="hydra-core" />
            <item index="128" class="java.lang.String" itemvalue="museval" />
            <item index="129" class="java.lang.String" itemvalue="mypy" />
            <item index="130" class="java.lang.String" itemvalue="hydra-colorlog" />
            <item index="131" class="java.lang.String" itemvalue="flake8" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredIdentifiers">
        <list>
          <option value="sentiment_classification.model_predictions.audio_path" />
          <option value="sentiment_classification.model_predictions.sample_rate" />
          <option value="sentiment_classification.model_predictions.num_samples" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>
-6
@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
Generated
-4
@@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (VITS)" project-jdk-type="Python SDK" />
</project>
Generated
-8
@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" filepath="$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" />
    </modules>
  </component>
</project>
Generated
-6
@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
@@ -25,10 +25,10 @@ Your-zip-file.zip
Quality requirement: longer than 2 seconds, shorter than 10 seconds, with as little background noise as possible.
Quantity requirement: at least 10 clips per character; 20+ clips per character is recommended.
2. Long audio files named after the character, each containing a single speaker only; background sound will be removed automatically. Naming format: `{CharacterName}_{random_number}.wav`
-(e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`); they must be `.wav` files.
+(e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`); they must be `.wav` files, no longer than 20 minutes each (otherwise you will run out of memory).

3. Long video files named after the character, each containing a single speaker only; background sound will be removed automatically. Naming format: `{CharacterName}_{random_number}.mp4`
-(e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`); they must be `.mp4` files.
+(e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`); they must be `.mp4` files, no longer than 20 minutes each (otherwise you will run out of memory).
Note: in the file name, `CharacterName` must consist of English characters only; `random_number` is required to distinguish multiple files of the same character, and can be any integer between 0 and 999999.

4. A `.txt` file containing one `{CharacterName}|{video_url}` entry per line, formatted as follows:
@@ -39,4 +39,4 @@ Char2|https://xyz.com/video3/
Char3|https://xyz.com/video4/
```
Each video must contain a single speaker only; background sound will be removed automatically. Currently only videos from bilibili are supported; URLs from other sites have not been tested.
If you have questions about the formats, sample data for every format can be found [here](https://drive.google.com/file/d/132l97zjanpoPY4daLgqXoM7HKXPRbS84/view?usp=sharing).
@@ -24,6 +24,7 @@ Your-zip-file.zip
Note that the format of the audio files does not matter as long as they are audio files.
Quality requirement: >=2s, <=10s, with as little background sound as possible.
Quantity requirement: at least 10 per character; 20+ per character is recommended.

2. Long audio files named after the character, which should contain a single speaker's voice only. Background sound is
acceptable since it will be automatically removed. File name format: `{CharacterName}_{random_number}.wav`
(e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`); must be `.wav` files.
@@ -0,0 +1,118 @@
# Train locally
### Build environment
0. Make sure you have installed `Python==3.8`, CMake & C/C++ compilers, and ffmpeg;
1. Clone this repository;
2. Run `pip install -r requirements.txt`;
3. Install the GPU version of PyTorch (make sure you have CUDA 11.6 or 11.7 installed):
```
# CUDA 11.6
pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
# CUDA 11.7
pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117
```
4. Install the necessary libraries for dealing with video data:
```
pip install imageio==2.4.1
pip install moviepy
```
5. Build monotonic align (necessary for training):
```
cd monotonic_align
mkdir monotonic_align
python setup.py build_ext --inplace
cd ..
```
6. Download auxiliary data for training:
```
mkdir pretrained_models
# download data for fine-tuning
wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/sampled_audio4ft_v2.zip
unzip sampled_audio4ft_v2.zip
# create necessary directories
mkdir video_data
mkdir raw_audio
mkdir denoised_audio
mkdir custom_character_voice
mkdir segmented_character_voice
```
7. Download a pretrained model; the available options are:
```
CJE: Trilingual (Chinese, Japanese, English)
CJ: Bilingual (Chinese, Japanese)
C: Chinese only
```
### Linux
To download the `CJE` model, run the following:
```
wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/D_trilingual.pth -O ./pretrained_models/D_0.pth
wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth -O ./pretrained_models/G_0.pth
wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/configs/uma_trilingual.json -O ./configs/finetune_speaker.json
```
To download the `CJ` model, run the following:
```
wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/D_0-p.pth -O ./pretrained_models/D_0.pth
wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/G_0-p.pth -O ./pretrained_models/G_0.pth
wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/config.json -O ./configs/finetune_speaker.json
```
To download the `C` model, run the following:
```
wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/D_0.pth -O ./pretrained_models/D_0.pth
wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/G_0.pth -O ./pretrained_models/G_0.pth
wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/config.json -O ./configs/finetune_speaker.json
```
### Windows
Manually download `G_0.pth`, `D_0.pth`, and `finetune_speaker.json` from the URLs in one of the options described above.

Rename all `G` models to `G_0.pth`, `D` models to `D_0.pth`, and config files (`.json`) to `finetune_speaker.json`.
Put `G_0.pth` and `D_0.pth` under the `pretrained_models` directory;
put `finetune_speaker.json` under the `configs` directory.

#### Please note that whichever option you download, the previously downloaded model will be overwritten.
8. Put your voice data under the corresponding directories; see [DATA_EN.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD) for details on the different upload options.
### Short audios
1. Prepare your data according to [DATA_EN.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD) as a single `.zip` file;
2. Put your file under the directory `./custom_character_voice/`;
3. Run `unzip ./custom_character_voice/custom_character_voice.zip -d ./custom_character_voice/`

### Long audios
1. Name your audio files according to [DATA_EN.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD);
2. Put your renamed audio files under the directory `./raw_audio/`

### Videos
1. Name your video files according to [DATA_EN.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD);
2. Put your renamed video files under the directory `./video_data/`
9. Process all audio data:
```
python scripts/video2audio.py
python scripts/denoise_audio.py
python scripts/long_audio_transcribe.py --languages "{PRETRAINED_MODEL}" --whisper_size large
python scripts/short_audio_transcribe.py --languages "{PRETRAINED_MODEL}" --whisper_size large
python scripts/resample.py
```
Replace `"{PRETRAINED_MODEL}"` with one of `{CJ, CJE, C}` according to your previous model choice.
Make sure you have at least 12 GB of GPU memory. If not, change the `whisper_size` argument to `medium` or `small`.

10. Process all text data.
If you choose to add auxiliary data, run `python preprocess_v2.py --add_auxiliary_data True --languages "{PRETRAINED_MODEL}"`;
if not, run `python preprocess_v2.py --languages "{PRETRAINED_MODEL}"`.
Again, replace `"{PRETRAINED_MODEL}"` with one of `{CJ, CJE, C}` according to your previous model choice.

11. Start training.
Run `python finetune_speaker_v2.py -m ./OUTPUT_MODEL --max_epochs "{Maximum_epochs}" --drop_speaker_embed True`,
replacing `{Maximum_epochs}` with your desired number of epochs. Empirically, 100 or more is recommended.
To continue training from a previous checkpoint, change the training command to `python finetune_speaker_v2.py -m ./OUTPUT_MODEL --max_epochs "{Maximum_epochs}" --drop_speaker_embed False --cont True`. Before you do this, make sure the previous `G_latest.pth` and `D_latest.pth` are under the `./OUTPUT_MODEL/` directory.
To view training progress, open a new terminal, `cd` to the project root directory, and run `tensorboard --logdir=./OUTPUT_MODEL`, then visit `localhost:6006` in your web browser.

12. After training is completed, you can use your model by running:
`python VC_inference.py --model_dir ./OUTPUT_MODEL/G_latest.pth --share True`
13. To clear all audio data, run:
### Linux
```
rm -rf ./custom_character_voice/* ./video_data/* ./raw_audio/* ./denoised_audio/* ./segmented_character_voice/* ./separated/* long_character_anno.txt short_character_anno.txt
```
### Windows
```
del /Q /S .\custom_character_voice\* .\video_data\* .\raw_audio\* .\denoised_audio\* .\segmented_character_voice\* .\separated\* long_character_anno.txt short_character_anno.txt
```
@@ -10,7 +10,9 @@ to make it able to do the following tasks in less than 1 hour:
Welcome to play around with the base models!
Chinese & English & Japanese: [](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: Me

Chinese & Japanese: [](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)

Chinese only: (no running Hugging Face Space) Author: [Wwwwhy230825](https://github.com/Wwwwhy230825)


### Currently Supported Tasks:
@@ -20,17 +22,15 @@ Chinese & Japanese:[

### Currently Supported Characters for TTS & VC:
- [x] Umamusume Pretty Derby (used in base model pretraining)
- [x] Sanoba Witch (used in base model pretraining)
- [x] Genshin Impact (used in base model pretraining)
- [x] Any character you wish, as long as you have their voices!

(Note that voice conversion can only be conducted between any two speakers in the model)



## Fine-tuning
-It's recommended to perform fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
-because the original VITS has some dependencies that are difficult to configure.
+See [LOCAL.md](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/LOCAL.md) for a local training guide.
+Alternatively, you can perform fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing).


### How long does it take?
1. Install dependencies (3 min)
@@ -52,7 +52,11 @@ inference
└───G_latest.pth
```
4. Run `inference.exe`; the browser should pop up automatically.
5. Note: you must install `ffmpeg` to enable the voice conversion feature.

## Use in MoeGoe
0. Prepare the downloaded model & config file, which are named `G_latest.pth` and `moegoe_config.json`, respectively.
1. Follow the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page instructions to install, configure the paths, and use.

## Looking for help?
If you have any questions, please feel free to open an [issue](https://github.com/Plachtaa/VITS-fast-fine-tuning/issues/new) or join our [Discord](https://discord.gg/TcrjDFvm5A) server.
+16
-8
@@ -10,7 +10,9 @@ English Documentation Please Click [here](https://github.com/Plachtaa/VITS-fast-

Chinese, Japanese & English: [](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: Me

Chinese & Japanese: [](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)

Chinese only: (no running Hugging Face demo) Author: [Wwwwhy230825](https://github.com/Wwwwhy230825)

### Currently supported tasks:
- [x] Clone a character's voice from 10+ short audio clips
@@ -19,19 +21,18 @@ English Documentation Please Click [here](https://github.com/Plachtaa/VITS-fast-
- [x] Clone a character's voice from a bilibili video link (each video may contain only a single speaker)

### Characters currently supported for voice conversion and Chinese/Japanese/English TTS
- [x] Umamusume Pretty Derby (implemented characters only) (used in base model pretraining)
- [x] Sanoba Witch (Yuzusoft) (5 characters) (used in base model pretraining)
- [x] Genshin Impact (implemented characters only) (used in base model pretraining)
- [x] Any character you wish (as long as you have samples of their voice)
(Note: voice conversion can only be performed between any two speakers that exist in the model)




## Fine-tuning
-It is recommended to perform the fine-tuning task on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing),
-since some of the environment dependencies of multilingual VITS are rather difficult to configure.
-### How long will it take on Google Colab?
-1. Install dependencies (3 min)
+If you want to train on your local machine, see [LOCAL.md](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/LOCAL.md).
+Alternatively, you can use [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
+for the fine-tuning task.
+### How long will it take?
+1. Install dependencies (10 min on Google Colab)
2. Choose a pretrained model; see the [Colab notebook page](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing) for the differences in detail.
3. Upload the voices of any other characters you wish to add; see [DATA.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA.MD) for the upload options in detail.
4. Fine-tune; depending on the chosen option and the number of samples, this may take anywhere from 20 minutes to 2 hours.
@@ -51,8 +52,15 @@ inference
└───G_latest.pth
```
4. Run `inference.exe`; a browser window will pop up automatically. Note that its path must not contain Chinese characters or spaces.
5. Note that `ffmpeg` must be installed for the voice conversion feature to work.

## Use in MoeGoe
0. MoeGoe (and other similar VITS inference UIs) uses a slightly different config format; the files to download are the model `G_latest.pth` and the config file `moegoe_config.json`.
1. Follow the instructions on the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page to configure the paths, and it is ready to use.
2. When entering a sentence in MoeGoe, wrap it in the matching language marks, otherwise synthesis will not work properly ([JA] for Japanese, [ZH] for Chinese, [EN] for English), for example:
[JA]こんにちわ。[JA]
[ZH]你好![ZH]
[EN]Hello![EN]

## Help
If you run into any problems, you can open an issue [here](https://github.com/Plachtaa/VITS-fast-fine-tuning/issues/new), or join the Discord server to ask for help: [Discord](https://discord.gg/TcrjDFvm5A).
+15
-3
@@ -9,10 +9,16 @@ import utils
from models import SynthesizerTrn
import gradio as gr
import librosa
import webbrowser

from text import text_to_sequence, _clean_text
device = "cuda:0" if torch.cuda.is_available() else "cpu"
import logging
logging.getLogger("PIL").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("asyncio").setLevel(logging.WARNING)

language_marks = {
    "Japanese": "",
    "日本語": "[JA]",
@@ -21,6 +27,8 @@ language_marks = {
    "Mix": "",
}
lang = ['日本語', '简体中文', 'English', 'Mix']


def get_text(text, hps, is_symbol):
    text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
    if hps.data.add_blank:
@@ -28,6 +36,7 @@ def get_text(text, hps, is_symbol):
    text_norm = LongTensor(text_norm)
    return text_norm


def create_tts_fn(model, hps, speaker_ids):
    def tts_fn(text, speaker, language, speed):
        if language is not None:
@@ -45,6 +54,7 @@ def create_tts_fn(model, hps, speaker_ids):

    return tts_fn


def create_vc_fn(model, hps, speaker_ids):
    def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
        input_audio = record_audio if record_audio is not None else upload_audio
@@ -76,6 +86,8 @@ def create_vc_fn(model, hps, speaker_ids):
        return "Success", (hps.data.sampling_rate, audio)

    return vc_fn


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./G_latest.pth", help="directory to your fine-tuned model")
@@ -121,12 +133,12 @@ if __name__ == "__main__":
                                outputs=[text_output, audio_output])
        with gr.Tab("Voice Conversion"):
            gr.Markdown("""
-                        录制或上传声音,并选择要转换的音色。User代表的音色是你自己。
+                        录制或上传声音,并选择要转换的音色。
            """)
            with gr.Column():
                record_audio = gr.Audio(label="record your voice", source="microphone")
                upload_audio = gr.Audio(label="or upload audio here", source="upload")
-                source_speaker = gr.Dropdown(choices=speakers, value="User", label="source speaker")
+                source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
                target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
            with gr.Column():
                message_box = gr.Textbox(label="Message")
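The `language_marks` wrapping above is shared by the web UI, the API server, and the command-line script that follow; a self-contained sketch of how the marks are applied (the `wrap` helper here is illustrative only, not part of the repository):

```python
# Minimal sketch of the language-mark wrapping used by the inference code above.
language_marks = {
    "日本語": "[JA]",
    "简体中文": "[ZH]",
    "English": "[EN]",
    "Mix": "",
}


def wrap(text: str, language: str) -> str:
    # The marks tell the multilingual text cleaner which per-language
    # processing to apply; "Mix" expects the caller to embed marks
    # (e.g. "[ZH]你好[ZH][EN]hello[EN]") manually.
    return language_marks[language] + text + language_marks[language]


assert wrap("你好!", "简体中文") == "[ZH]你好![ZH]"
assert wrap("Hello!", "English") == "[EN]Hello![EN]"
```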
@@ -0,0 +1,132 @@
import argparse
import io
import json
from json import JSONDecodeError
from pathlib import Path
from urllib.parse import parse_qs

import soundfile as sf
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
from hypy_utils.logging_utils import setup_logger
from starlette.middleware.cors import CORSMiddleware
from torch import no_grad, LongTensor

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence


log = setup_logger()

app = FastAPI()
device = "cuda:0" if torch.cuda.is_available() else "cpu"

language_marks = {
    "日本語": "[JA]",
    "简体中文": "[ZH]",
    "English": "[EN]",
    "Mix": "",
}

# Allow all CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def get_text(text: str, is_symbol: bool):
    text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm


def tts_fn(text: str, speaker: str, language: str, speed: float):
    if language is not None:
        text = language_marks[language] + text + language_marks[language]
    speaker_id = speaker_ids[speaker]
    stn_tst = get_text(text, False)
    with no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
        sid = LongTensor([speaker_id]).to(device)
        audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                            length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
    del stn_tst, x_tst, x_tst_lengths, sid
    return audio


@app.get("/tts/options")
async def get_options():
    return {"speakers": list(speaker_ids.keys()), "languages": list(language_marks.keys())}


@app.post("/tts")
async def generate(request: Request):
    body = (await request.body()).decode()

    # Try to parse the body as JSON
    if body.startswith('{'):
        try:
            data = json.loads(body)
        except JSONDecodeError:
            raise HTTPException(status_code=400, detail="Invalid JSON format")
    # Otherwise, try to parse it as x-www-form-urlencoded
    else:
        data = parse_qs(body)
        data = {k: v[0] for k, v in data.items()}

    log.info(data)

    text = data.get('text').strip().replace("\n", " ")
    speaker = data.get('speaker')
    language = data.get('language', '日本語')
    speed = data.get('speed', 1.0)

    if len(text) > 200:
        raise HTTPException(status_code=400, detail="TL;DR")

    if not text or not speaker or language not in language_marks:
        raise HTTPException(status_code=400, detail="Invalid speaker or language (please check /tts/options)")

    audio = tts_fn(text, speaker, language, speed)
    audio_io = io.BytesIO()
    # sf.write(audio_io, audio, hps.data.sampling_rate, format='OGG')
    # Since Safari doesn't support OGG, use MP3 instead
    sf.write(audio_io, audio, hps.data.sampling_rate, format='MP3')
    audio_io.seek(0)

    return StreamingResponse(audio_io, media_type='audio/mpeg',
                             headers={'Content-Disposition': 'attachment; filename="output.mp3"'})


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", default="./OUTPUT_MODEL",
                        help="directory to your fine-tuned model (contains G_latest.pth and config.json)")
    args = parser.parse_args()
    d_config = Path(args.d) / "config.json"
    d_model = Path(args.d) / "G_latest.pth"
    hps = utils.get_hparams_from_file(d_config)

    model = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = model.eval()

    utils.load_checkpoint(d_model, model, None)
    speaker_ids = hps.speakers

    uvicorn.run(app, host='0.0.0.0', port=27519)
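For reference, a minimal client for the `/tts` endpoint added above — a sketch assuming the server is running locally on its configured port 27519:

```python
# Minimal sketch of a client for the /tts endpoint above,
# assuming the server is running locally on port 27519.
import requests

# Discover the valid speakers and languages first.
options = requests.get("http://localhost:27519/tts/options").json()
print(options["speakers"], options["languages"])

# Request synthesis; the response body is an MP3 stream.
resp = requests.post("http://localhost:27519/tts", json={
    "text": "こんにちは",
    "speaker": options["speakers"][0],
    "language": "日本語",
    "speed": 1.0,
})
resp.raise_for_status()
with open("output.mp3", "wb") as f:
    f.write(resp.content)
```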
@@ -0,0 +1,106 @@
"""This module generates VITS audio files from the command line.

Usage:

    python cmd_inference.py -m MODEL_PATH -c CONFIG_PATH -o OUTPUT_PATH -l LANGUAGE -t TEXT -s TARGET_SPEAKER

Optional arguments:
    -ns   noise scale (degree of emotional variation)
    -nsw  noise scale w (length of phoneme pronunciation)
    -ls   length scale (overall speaking rate)
    -on   name of the output file
"""

from pathlib import Path
import utils
from models import SynthesizerTrn
import torch
from torch import no_grad, LongTensor
import librosa
from text import text_to_sequence, _clean_text
import commons
import scipy.io.wavfile as wavf
import os

device = "cuda:0" if torch.cuda.is_available() else "cpu"

language_marks = {
    "Japanese": "",
    "日本語": "[JA]",
    "简体中文": "[ZH]",
    "English": "[EN]",
    "Mix": "",
}


def get_text(text, hps, is_symbol):
    text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='vits inference')
    # required arguments
    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='model path')
    parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='config file path')
    parser.add_argument('-o', '--output_path', type=str, default="output/vits", help='output path')
    parser.add_argument('-l', '--language', type=str, default="日本語", help='input language')
    parser.add_argument('-t', '--text', type=str, help='input text')
    parser.add_argument('-s', '--spk', type=str, help='name of the target speaker to synthesize')
    # optional arguments
    parser.add_argument('-on', '--output_name', type=str, default="output", help='name of the output file')
    parser.add_argument('-ns', '--noise_scale', type=float, default=.667, help='noise scale (degree of emotional variation)')
    parser.add_argument('-nsw', '--noise_scale_w', type=float, default=0.6, help='noise scale w (length of phoneme pronunciation)')
    parser.add_argument('-ls', '--length_scale', type=float, default=1, help='length scale (overall speaking rate)')

    args = parser.parse_args()

    model_path = args.model_path
    config_path = args.config_path
    output_dir = Path(args.output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    language = args.language
    text = args.text
    spk = args.spk
    noise_scale = args.noise_scale
    noise_scale_w = args.noise_scale_w
    length = args.length_scale
    output_name = args.output_name

    hps = utils.get_hparams_from_file(config_path)
    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()
    _ = utils.load_checkpoint(model_path, net_g, None)

    speaker_ids = hps.speakers

    if language is not None:
        text = language_marks[language] + text + language_marks[language]
    speaker_id = speaker_ids[spk]
    stn_tst = get_text(text, hps, False)
    with no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
        sid = LongTensor([speaker_id]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                            length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()
    del stn_tst, x_tst, x_tst_lengths, sid

    wavf.write(str(output_dir / f"{output_name}.wav"), hps.data.sampling_rate, audio)
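An example invocation of the script above — the model path, config path, and speaker name here are hypothetical placeholders; substitute your own:

```
python cmd_inference.py -m ./OUTPUT_MODEL/G_latest.pth -c ./OUTPUT_MODEL/config.json -o ./output/vits -l 简体中文 -t "你好!" -s MySpeaker -ls 1.0 -on hello
```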
+14
-5
@@ -195,10 +195,19 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

-        for i in range(len(buckets) - 1, 0, -1):
-            if len(buckets[i]) == 0:
-                buckets.pop(i)
-                self.boundaries.pop(i + 1)
+        try:
+            for i in range(len(buckets) - 1, 0, -1):
+                if len(buckets[i]) == 0:
+                    buckets.pop(i)
+                    self.boundaries.pop(i + 1)
+            assert all(len(bucket) > 0 for bucket in buckets)
+        # When one bucket is not traversed
+        except Exception as e:
+            print('Bucket warning ', e)
+            for i in range(len(buckets) - 1, -1, -1):
+                if len(buckets[i]) == 0:
+                    buckets.pop(i)
+                    self.boundaries.pop(i + 1)

        num_samples_per_bucket = []
        for i in range(len(buckets)):
@@ -264,4 +273,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
            return -1

    def __len__(self):
        return self.num_samples // self.batch_size
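The point of the new `try/except` above: the original pruning loop never examines bucket 0, so an empty first bucket survives and later trips the `assert`; the fallback loop scans all the way down to index 0. A self-contained sketch of the two behaviors, using made-up bucket contents:

```python
# Standalone sketch of the bucket-pruning change above, with made-up buckets.
def prune(buckets, boundaries, stop):
    # stop=0 reproduces the old loop (never examines bucket 0);
    # stop=-1 reproduces the new fallback loop (examines bucket 0 too).
    for i in range(len(buckets) - 1, stop, -1):
        if len(buckets[i]) == 0:
            buckets.pop(i)
            boundaries.pop(i + 1)
    return buckets, boundaries


print(prune([[], [3], [], [7]], [32, 300, 500, 700, 900], 0))
# ([[], [3], [7]], [32, 300, 500, 900])  <- empty bucket 0 is left behind
print(prune([[], [3], [], [7]], [32, 300, 500, 700, 900], -1))
# ([[3], [7]], [32, 500, 900])           <- bucket 0 and its boundary are pruned
```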
@@ -1,23 +0,0 @@
from google.colab import files
import shutil
import os
basepath = os.getcwd()
uploaded = files.upload()  # upload files
for filename in uploaded.keys():
    assert filename.endswith(".txt"), "speaker-videolink info could only be .txt file!"
    shutil.move(os.path.join(basepath, filename), os.path.join("./speaker_links.txt"))

with open("./speaker_links.txt", 'r', encoding='utf-8') as f:
    lines = f.readlines()
speakers = []
for line in lines:
    line = line.replace("\n", "").replace(" ", "")
    if line == "":
        continue
    speaker, link = line.split("|")
    if speaker not in speakers:
        speakers.append(speaker)
    # download link
    import random
    filename = speaker + "_" + str(random.randint(0, 1000000))
    os.system(f"youtube-dl -f 0 {link} -o ./video_data/{filename}.mp4")
+64
-13
@@ -98,10 +98,30 @@ def run(rank, n_gpus, hps):
    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)

    # load existing model
-    _, _, _, _ = utils.load_checkpoint("./pretrained_models/G_0.pth", net_g, None, drop_speaker_emb=hps.drop_speaker_embed)
-    _, _, _, _ = utils.load_checkpoint("./pretrained_models/D_0.pth", net_d, None)
-    epoch_str = 1
-    global_step = 0
+    if hps.cont:
+        try:
+            _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_latest.pth"), net_g, None)
+            _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_latest.pth"), net_d, None)
+            global_step = (epoch_str - 1) * len(train_loader)
+        except:
+            print("Failed to find latest checkpoint, loading G_0.pth...")
+            if hps.train_with_pretrained_model:
+                print("Train with pretrained model...")
+                _, _, _, epoch_str = utils.load_checkpoint("./pretrained_models/G_0.pth", net_g, None)
+                _, _, _, epoch_str = utils.load_checkpoint("./pretrained_models/D_0.pth", net_d, None)
+            else:
+                print("Train without pretrained model...")
+            epoch_str = 1
+            global_step = 0
+    else:
+        if hps.train_with_pretrained_model:
+            print("Train with pretrained model...")
+            _, _, _, epoch_str = utils.load_checkpoint("./pretrained_models/G_0.pth", net_g, None)
+            _, _, _, epoch_str = utils.load_checkpoint("./pretrained_models/D_0.pth", net_d, None)
+        else:
+            print("Train without pretrained model...")
+        epoch_str = 1
+        global_step = 0
    # freeze all other layers except speaker embedding
    for p in net_g.parameters():
        p.requires_grad = True
@@ -240,16 +260,47 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
            if global_step % hps.train.eval_interval == 0:
                evaluate(hps, net_g, eval_loader, writer_eval)
-                utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
-                utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch,
-                                      os.path.join(hps.model_dir, "G_latest.pth".format(global_step)))
-                # utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
-                old_g = os.path.join(hps.model_dir, "G_{}.pth".format(global_step - 4000))
-                # old_d = os.path.join(hps.model_dir, "D_{}.pth".format(global_step - 400))
-                if os.path.exists(old_g):
-                    os.remove(old_g)
-                # if os.path.exists(old_d):
-                #     os.remove(old_d)
+                utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch,
+                                      os.path.join(hps.model_dir, "G_latest.pth"))
+                utils.save_checkpoint(net_d, None, hps.train.learning_rate, epoch,
+                                      os.path.join(hps.model_dir, "D_latest.pth"))
+                # save to google drive
+                if os.path.exists("/content/drive/MyDrive/"):
+                    utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch,
+                                          os.path.join("/content/drive/MyDrive/", "G_latest.pth"))
+                    utils.save_checkpoint(net_d, None, hps.train.learning_rate, epoch,
+                                          os.path.join("/content/drive/MyDrive/", "D_latest.pth"))
+                if hps.preserved > 0:
+                    utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch,
+                                          os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
+                    utils.save_checkpoint(net_d, None, hps.train.learning_rate, epoch,
+                                          os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
+                    old_g = utils.oldest_checkpoint_path(hps.model_dir, "G_[0-9]*.pth",
+                                                         preserved=hps.preserved)  # Preserve 4 (default) historical checkpoints.
+                    old_d = utils.oldest_checkpoint_path(hps.model_dir, "D_[0-9]*.pth", preserved=hps.preserved)
+                    if os.path.exists(old_g):
+                        print(f"remove {old_g}")
+                        os.remove(old_g)
+                    if os.path.exists(old_d):
+                        print(f"remove {old_d}")
+                        os.remove(old_d)
+                    if os.path.exists("/content/drive/MyDrive/"):
+                        utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch,
+                                              os.path.join("/content/drive/MyDrive/", "G_{}.pth".format(global_step)))
+                        utils.save_checkpoint(net_d, None, hps.train.learning_rate, epoch,
+                                              os.path.join("/content/drive/MyDrive/", "D_{}.pth".format(global_step)))
+                        old_g = utils.oldest_checkpoint_path("/content/drive/MyDrive/", "G_[0-9]*.pth",
+                                                             preserved=hps.preserved)  # Preserve 4 (default) historical checkpoints.
+                        old_d = utils.oldest_checkpoint_path("/content/drive/MyDrive/", "D_[0-9]*.pth", preserved=hps.preserved)
+                        if os.path.exists(old_g):
+                            print(f"remove {old_g}")
+                            os.remove(old_g)
+                        if os.path.exists(old_d):
+                            print(f"remove {old_d}")
+                            os.remove(old_d)
            global_step += 1
    if epoch > hps.max_epochs:
        print("Maximum epoch reached, closing training...")
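The rotation above depends on `utils.oldest_checkpoint_path`, which this diff does not show. Purely as an illustration of what such a helper could look like — a hypothetical sketch, not the repository's actual implementation:

```python
# Hypothetical sketch of an oldest_checkpoint_path helper -- illustration only,
# not the repository's actual implementation.
import glob
import os
import re


def oldest_checkpoint_path(dir_path, pattern="G_[0-9]*.pth", preserved=4):
    # Collect step-numbered checkpoints and sort them by step number.
    paths = glob.glob(os.path.join(dir_path, pattern))
    paths.sort(key=lambda p: int(re.findall(r"\d+", os.path.basename(p))[-1]))
    if len(paths) > preserved:
        # Everything older than the newest `preserved` checkpoints is a
        # deletion candidate; return the oldest one.
        return paths[0]
    return ""  # nothing to delete; os.path.exists("") is False
```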
+2
-2
@@ -64,7 +64,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True)
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
@@ -102,7 +102,7 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
    y = y.squeeze(1)

    spec = torch.stft(y.float(), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True)
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
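A note on the `return_complex=False` change: it makes the intent explicit on PyTorch 1.13, but the real-valued STFT output layout is deprecated and slated for removal in later PyTorch releases. A forward-compatible sketch of the same call (mirroring the hunk above, assuming PyTorch >= 1.8), which reproduces the `[..., 2]` real/imaginary layout that the following `spec.pow(2).sum(-1)` expects:

```python
# Forward-compatible equivalent of the stft call above:
# return_complex=True plus view_as_real reproduces the old [..., 2]
# real/imaginary layout that spec.pow(2).sum(-1) expects.
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size,
                  window=hann_window[wnsize_dtype_device],
                  center=center, pad_mode='reflect', normalized=False,
                  onesided=True, return_complex=True)
spec = torch.view_as_real(spec)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
```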
@@ -1,16 +1,15 @@
-import copy
import math

import torch
from torch import nn
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn import functional as F
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm

+import attentions
import commons
import modules
-import attentions
import monotonic_align

-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding
@@ -386,7 +385,6 @@ class MultiPeriodDiscriminator(torch.nn.Module):
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
-402
@@ -1,402 +0,0 @@
|
||||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
import commons
|
||||
import modules
|
||||
import attentions
|
||||
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from commons import init_weights, get_padding
|
||||
|
||||
|
||||

class StochasticDurationPredictor(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
        super().__init__()
        filter_channels = in_channels  # NOTE: this override should be removed in a future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.flows.append(modules.Flip())

        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.post_flows.append(modules.Flip())

        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        x = torch.detach(x)
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
            logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q

            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
            return nll + logq  # [b]
        else:
            flows = list(reversed(self.flows))
            flows = flows[:-2] + [flows[-1]]  # remove the unused vflow
            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            z0, z1 = torch.split(z, [1, 1], 1)
            logw = z0
            return logw

class DurationPredictor(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.norm_1(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        x = torch.relu(x)
        x = self.norm_2(x)
        x = self.drop(x)
        x = self.proj(x * x_mask)
        return x * x_mask

class TextEncoder(nn.Module):
    def __init__(self,
                 n_vocab,
                 out_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.emb = nn.Embedding(n_vocab, hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths):
        x = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask

class ResidualCouplingBlock(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x

class PosteriorEncoder(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

class Generator(torch.nn.Module):
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
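A quick sanity check on the Generator: each ConvTranspose1d stage upsamples by its rate, so the output waveform length is the input frame count times the product of upsample_rates, which has to match the STFT hop length of the spectrogram frontend. A minimal illustration, using [8, 8, 2, 2] and a hop of 256 as assumed example values, not taken from this diff:

import math

upsample_rates = [8, 8, 2, 2]  # assumed example configuration
hop_length = 256               # assumed matching STFT hop size
assert math.prod(upsample_rates) == hop_length  # 8 * 8 * 2 * 2 == 256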

class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(self,
                 n_vocab,
                 spec_channels,
                 segment_size,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 n_speakers=0,
                 gin_channels=0,
                 use_sdp=True,
                 **kwargs):

        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels

        self.use_sdp = use_sdp

        self.enc_p = TextEncoder(n_vocab,
                                 inter_channels,
                                 hidden_channels,
                                 filter_channels,
                                 n_heads,
                                 n_layers,
                                 kernel_size,
                                 p_dropout)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

        if use_sdp:
            self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
        else:
            self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

        if n_speakers > 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None

        if self.use_sdp:
            logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
        else:
            logw = self.dp(x, x_mask, g=g)
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)

    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
        assert self.n_speakers > 0, "n_speakers must be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)
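For orientation, infer() chains the pieces defined above: the text encoder produces the prior, the duration predictor (stochastic or deterministic) yields per-phoneme durations, generate_path expands the prior along that alignment, the flow is inverted, and the Generator decodes the waveform. A minimal usage sketch; the hyperparameter values here are assumptions for the example, not taken from this diff:

import torch

net_g = SynthesizerTrn(n_vocab=100, spec_channels=513, segment_size=32,
                       inter_channels=192, hidden_channels=192, filter_channels=768,
                       n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
                       resblock='1', resblock_kernel_sizes=[3, 7, 11],
                       resblock_dilation_sizes=[[1, 3, 5]] * 3,
                       upsample_rates=[8, 8, 2, 2], upsample_initial_channel=512,
                       upsample_kernel_sizes=[16, 16, 4, 4],
                       n_speakers=2, gin_channels=256).eval()
x = torch.randint(0, 100, (1, 20))   # dummy phoneme ids
x_lengths = torch.LongTensor([20])
sid = torch.LongTensor([0])          # speaker id
with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(x, x_lengths, sid=sid,
                                         noise_scale=0.667, length_scale=1.0)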
+1
-1
@@ -69,7 +69,7 @@ class ConvReluNorm(nn.Module):

class DDSConv(nn.Module):
    """
-   Dialted and Depth-Separable Convolution
+   Dilated and Depth-Separable Convolution
    """
    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
+44
-17
@@ -1,11 +1,20 @@
import os
import argparse
import json
import sys
sys.setrecursionlimit(500000)  # avoid "RecursionError: maximum recursion depth exceeded"; raise the limit further if needed

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--add_auxiliary_data", type=bool, help="Whether to add extra data as fine-tuning helper")
    parser.add_argument("--languages", default="CJE")
    args = parser.parse_args()

    if args.languages == "CJE":
        langs = ["[ZH]", "[JA]", "[EN]"]
    elif args.languages == "CJ":
        langs = ["[ZH]", "[JA]"]
    elif args.languages == "C":
        langs = ["[ZH]"]
    new_annos = []
    # Source 1: transcribed short audios
    if os.path.exists("short_character_anno.txt"):
@@ -13,8 +22,8 @@ if __name__ == "__main__":
            short_character_anno = f.readlines()
            new_annos += short_character_anno
    # Source 2: transcribed long audio segments
-   if os.path.exists("long_character_anno.txt"):
-       with open("long_character_anno.txt", 'r', encoding='utf-8') as f:
+   if os.path.exists("./long_character_anno.txt"):
+       with open("./long_character_anno.txt", 'r', encoding='utf-8') as f:
            long_character_anno = f.readlines()
            new_annos += long_character_anno

@@ -27,8 +36,19 @@ if __name__ == "__main__":
    assert (len(speakers) != 0), "No audio file found. Please check your uploaded file structure."
    # Source 3 (Optional): sampled audios as extra training helpers
    if args.add_auxiliary_data:
-       with open("sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
+       with open("./sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
            old_annos = f.readlines()
+       # filter old_annos according to supported languages
+       filtered_old_annos = []
+       for line in old_annos:
+           for lang in langs:
+               if lang in line:
+                   filtered_old_annos.append(line)
+       old_annos = filtered_old_annos
        for line in old_annos:
            path, speaker, text = line.split("|")
            if speaker not in speakers:
                speakers.append(speaker)
        num_old_voices = len(old_annos)
        num_new_voices = len(new_annos)
        # STEP 1: balance number of new & old voices
@@ -44,14 +64,13 @@ if __name__ == "__main__":
        # assign ids to new speakers
        speaker2id = {}
        for i, speaker in enumerate(speakers):
-           speaker2id[speaker] = hps['data']["n_speakers"] + i
+           speaker2id[speaker] = i
        # modify n_speakers
-       hps['data']["n_speakers"] = hps['data']["n_speakers"] + len(speakers)
-       # add speaker names
-       for speaker in speakers:
-           hps['speakers'][speaker] = speaker2id[speaker]
-       hps['train']['log_interval'] = 100
-       hps['train']['eval_interval'] = 1000
+       hps['data']["n_speakers"] = len(speakers)
+       # overwrite speaker names
+       hps['speakers'] = speaker2id
+       hps['train']['log_interval'] = 10
+       hps['train']['eval_interval'] = 100
        hps['train']['batch_size'] = 16
        hps['data']['training_files'] = "final_annotation_train.txt"
        hps['data']['validation_files'] = "final_annotation_val.txt"
@@ -69,14 +88,22 @@ if __name__ == "__main__":
            cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
            cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
            cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
+       cleaned_old_annos = []
+       for i, line in enumerate(old_annos):
+           path, speaker, txt = line.split("|")
+           if len(txt) > 150:
+               continue
+           cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
+           cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
+           cleaned_old_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
        # merge with old annotation
-       final_annos = old_annos + cc_duplicate * cleaned_new_annos
+       final_annos = cleaned_old_annos + cc_duplicate * cleaned_new_annos
        # save annotation file
-       with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
+       with open("./final_annotation_train.txt", 'w', encoding='utf-8') as f:
            for line in final_annos:
                f.write(line)
        # save annotation file for validation
-       with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
+       with open("./final_annotation_val.txt", 'w', encoding='utf-8') as f:
            for line in cleaned_new_annos:
                f.write(line)
        print("finished")
@@ -117,11 +144,11 @@ if __name__ == "__main__":

        final_annos = cleaned_new_annos
        # save annotation file
-       with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
+       with open("./final_annotation_train.txt", 'w', encoding='utf-8') as f:
            for line in final_annos:
                f.write(line)
        # save annotation file for validation
-       with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
+       with open("./final_annotation_val.txt", 'w', encoding='utf-8') as f:
            for line in cleaned_new_annos:
                f.write(line)
        print("finished")
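Each annotation line this script reads and writes follows the path|speaker|text layout, with the speaker replaced by a numeric id and the text wrapped in language tags after cleaning. A hypothetical, made-up output entry, where the exact text form depends on the configured text_cleaners:

./custom_character_voice/alice/processed_0.wav|0|[ZH]ni3 hao3[ZH]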
+10
-8
@@ -1,13 +1,15 @@
-Cython
-librosa==0.9.1
-numpy
+Cython==0.29.21
+librosa==0.9.2
matplotlib==3.3.1
scikit-learn==1.0.2
scipy
+numpy==1.21.6
tensorboard
-torch --extra-index-url https://download.pytorch.org/whl/cu116
-torchvision --extra-index-url https://download.pytorch.org/whl/cu116
-torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
+torch
+torchvision
+torchaudio
unidecode
-pyopenjtalk
+pyopenjtalk-prebuilt
jamo
pypinyin
jieba
@@ -20,5 +22,5 @@ indic_transliteration==2.3.37
num_thai==0.0.5
opencc==1.1.1
demucs
-openai-whisper
+git+https://github.com/openai/whisper.git
gradio

@@ -1,9 +1,13 @@
import os
import json
import torchaudio
raw_audio_dir = "./raw_audio/"
denoise_audio_dir = "./denoised_audio/"
filelist = list(os.walk(raw_audio_dir))[0][2]

+# 2023/4/21: Get the target sampling rate
+with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
+    hps = json.load(f)
+target_sr = hps['data']['sampling_rate']
for file in filelist:
    if file.endswith(".wav"):
        os.system(f"demucs --two-stems=vocals {raw_audio_dir}{file}")
@@ -13,6 +17,6 @@ for file in filelist:
                              channels_first=True)
    # merge two channels into one
    wav = wav.mean(dim=0).unsqueeze(0)
-   if sr != 22050:
-       wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
-   torchaudio.save(denoise_audio_dir + file + ".wav", wav, 22050, channels_first=True)
+   if sr != target_sr:
+       wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
+   torchaudio.save(denoise_audio_dir + file + ".wav", wav, target_sr, channels_first=True)

@@ -0,0 +1,37 @@
import os
import random
import shutil
from concurrent.futures import ThreadPoolExecutor
from google.colab import files

basepath = os.getcwd()
uploaded = files.upload()  # upload files
for filename in uploaded.keys():
    assert (filename.endswith(".txt")), "speaker-videolink info can only be a .txt file!"
    shutil.move(os.path.join(basepath, filename), os.path.join("./speaker_links.txt"))


def generate_infos():
    infos = []
    with open("./speaker_links.txt", 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.replace("\n", "").replace(" ", "")
        if line == "":
            continue
        speaker, link = line.split("|")
        filename = speaker + "_" + str(random.randint(0, 1000000))
        infos.append({"link": link, "filename": filename})
    return infos


def download_video(info):
    link = info["link"]
    filename = info["filename"]
    os.system(f"youtube-dl -f 0 {link} -o ./video_data/{filename}.mp4 --no-check-certificate")


if __name__ == "__main__":
    infos = generate_infos()
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        executor.map(download_video, infos)
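generate_infos() splits each line of speaker_links.txt on "|" after stripping spaces, so the uploaded file is expected to hold one speaker|video_link pair per line. An illustrative, made-up example file:

alice|https://www.youtube.com/watch?v=XXXXXXXXXXX
bob|https://www.youtube.com/watch?v=YYYYYYYYYYY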

@@ -1,6 +1,7 @@
from moviepy.editor import AudioFileClip
import whisper
import os
import json
import torchaudio
import librosa
import torch
@@ -27,13 +28,17 @@ if __name__ == "__main__":
        lang2token = {
            'zh': "[ZH]",
        }
    assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
+   with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
+       hps = json.load(f)
+   target_sr = hps['data']['sampling_rate']
    model = whisper.load_model(args.whisper_size)
    speaker_annos = []
    for file in filelist:
        print(f"transcribing {parent_dir + file}...\n")
        options = dict(beam_size=5, best_of=5)
        transcribe_options = dict(task="transcribe", **options)
-       result = model.transcribe(parent_dir + file, **transcribe_options)
+       result = model.transcribe(parent_dir + file, word_timestamps=True, **transcribe_options)
+       segments = result["segments"]
        # result = model.transcribe(parent_dir + file)
        lang = result['language']
@@ -58,12 +63,13 @@ if __name__ == "__main__":
            wav_seg_name = f"{character_name}_{code}_{i}.wav"
            savepth = "./segmented_character_voice/" + character_name + "/" + wav_seg_name
            speaker_annos.append(savepth + "|" + character_name + "|" + text)
+           print(f"Transcribed segment: {speaker_annos[-1]}")
            # trimmed_wav_seg = librosa.effects.trim(wav_seg.squeeze().numpy())
            # trimmed_wav_seg = torch.tensor(trimmed_wav_seg[0]).unsqueeze(0)
-           torchaudio.save(savepth, wav_seg, 22050, channels_first=True)
+           torchaudio.save(savepth, wav_seg, target_sr, channels_first=True)
    if len(speaker_annos) == 0:
        print("Warning: no long audios & videos found, this IS expected if you have only uploaded short audios")
        print("this IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure or make sure your audio/video language is supported.")
-   with open("long_character_anno.txt", 'w', encoding='utf-8') as f:
+   with open("./long_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)
@@ -0,0 +1,20 @@
import os
import json
import argparse
import torchaudio


def main():
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    filelist = list(os.walk("./sampled_audio4ft"))[0][2]
    if target_sr != 22050:
        for wavfile in filelist:
            wav, sr = torchaudio.load("./sampled_audio4ft" + "/" + wavfile, frame_offset=0, num_frames=-1,
                                      normalize=True, channels_first=True)
            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
            torchaudio.save("./sampled_audio4ft" + "/" + wavfile, wav, target_sr, channels_first=True)


if __name__ == "__main__":
    main()

@@ -1,7 +1,9 @@
import whisper
import os
import json
import torchaudio
import argparse
import torch

lang2token = {
    'zh': "[ZH]",
@@ -21,7 +23,7 @@ def transcribe_one(audio_path):
    print(f"Detected language: {max(probs, key=probs.get)}")
    lang = max(probs, key=probs.get)
    # decode the audio
-   options = whisper.DecodingOptions()
+   options = whisper.DecodingOptions(beam_size=5)
    result = whisper.decode(model, mel, options)

    # print the recognized text
@@ -47,11 +49,18 @@ if __name__ == "__main__":
        lang2token = {
            'zh': "[ZH]",
        }
    assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
    model = whisper.load_model(args.whisper_size)
    parent_dir = "./custom_character_voice/"
    speaker_names = list(os.walk(parent_dir))[0][1]
    speaker_annos = []
    total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
    # resample audios
    # 2023/4/21: Get the target sampling rate
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    processed_files = 0
    for speaker in speaker_names:
        for i, wavfile in enumerate(list(os.walk(parent_dir + speaker))[0][2]):
            # try to load file as audio
@@ -61,12 +70,12 @@ if __name__ == "__main__":
                wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
                                          channels_first=True)
                wav = wav.mean(dim=0).unsqueeze(0)
-               if sr != 22050:
-                   wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
+               if sr != target_sr:
+                   wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
                if wav.shape[1] / sr > 20:
                    print(f"{wavfile} too long, ignoring\n")
                save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
-               torchaudio.save(save_path, wav, 22050, channels_first=True)
+               torchaudio.save(save_path, wav, target_sr, channels_first=True)
                # transcribe text
                lang, text = transcribe_one(save_path)
                if lang not in list(lang2token.keys()):
@@ -74,6 +83,9 @@ if __name__ == "__main__":
                    continue
                text = lang2token[lang] + text + lang2token[lang] + "\n"
                speaker_annos.append(save_path + "|" + speaker + "|" + text)

                processed_files += 1
                print(f"Processed: {processed_files}/{total_files}")
            except:
                continue
@@ -0,0 +1,27 @@
import os
from concurrent.futures import ThreadPoolExecutor

from moviepy.editor import AudioFileClip

video_dir = "./video_data/"
audio_dir = "./raw_audio/"
filelist = list(os.walk(video_dir))[0][2]


def generate_infos():
    videos = []
    for file in filelist:
        if file.endswith(".mp4"):
            videos.append(file)
    return videos


def clip_file(file):
    my_audio_clip = AudioFileClip(video_dir + file)
    my_audio_clip.write_audiofile(audio_dir + file.rstrip("mp4") + "wav")


if __name__ == "__main__":
    infos = generate_infos()
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        executor.map(clip_file, infos)
+6
-11
@@ -29,19 +29,14 @@ def korean_cleaners(text):
    return text


-# def chinese_cleaners(text):
-#     '''Pipeline for Chinese text'''
-#     text = number_to_chinese(text)
-#     text = chinese_to_bopomofo(text)
-#     text = latin_to_bopomofo(text)
-#     text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
-#     return text

def chinese_cleaners(text):
-   from pypinyin import Style, pinyin
    '''Pipeline for Chinese text'''
-   text = text.replace("[ZH]", "")
-   phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)]
-   return ' '.join(phones)
+   text = number_to_chinese(text)
+   text = chinese_to_bopomofo(text)
+   text = latin_to_bopomofo(text)
+   text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
+   return text


def zh_ja_mixture_cleaners(text):

@@ -8,6 +8,7 @@ import subprocess
import numpy as np
from scipy.io.wavfile import read
import torch
import regex as re

MATPLOTLIB_FLAG = False
@@ -15,6 +16,135 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging


zh_pattern = re.compile(r'[\u4e00-\u9fa5]')
en_pattern = re.compile(r'[a-zA-Z]')
jp_pattern = re.compile(r'[\u3040-\u30ff\u31f0-\u31ff]')
kr_pattern = re.compile(r'[\uac00-\ud7af\u1100-\u11ff\u3130-\u318f\ua960-\ua97f]')
num_pattern = re.compile(r'[0-9]')
comma = r"(?<=[.。!!??;;,,、::'\"‘“”’()()《》「」~——])"  # fixed-length lookbehind
tags = {'ZH': '[ZH]', 'EN': '[EN]', 'JP': '[JA]', 'KR': '[KR]'}

def tag_cjke(text):
    '''Tag Chinese/English/Japanese/Korean text. Chinese and Japanese cannot be told
    apart by regex alone, so the text is first split into sentences, Chinese and
    Japanese are separated per sentence, and each part is then identified; this
    covers most cases.'''
    sentences = re.split(r"([.。!!??;;,,、::'\"‘“”’()()【】《》「」~——]+ *(?![0-9]))", text)  # split into sentences, excluding decimal points
    sentences.append("")
    sentences = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])]
    # print(sentences)
    prev_lang = None
    tagged_text = ""
    for s in sentences:
        # skip sentences that are all punctuation
        nu = re.sub(r'[\s\p{P}]+', '', s, flags=re.U).strip()
        if len(nu) == 0:
            continue
        s = re.sub(r'[()()《》「」【】‘“”’]+', '', s)
        jp = re.findall(jp_pattern, s)
        # a sentence containing Japanese characters is treated as Japanese
        if len(jp) > 0:
            prev_lang, tagged_jke = tag_jke(s, prev_lang)
            tagged_text += tagged_jke
        else:
            prev_lang, tagged_cke = tag_cke(s, prev_lang)
            tagged_text += tagged_cke
    return tagged_text

def tag_jke(text, prev_sentence=None):
    '''Tag English/Japanese/Korean text.'''
    # initialize tagging state
    tagged_text = ""
    prev_lang = None
    tagged = 0
    # iterate over the text
    for char in text:
        # determine which language the current character belongs to
        if jp_pattern.match(char):
            lang = "JP"
        elif zh_pattern.match(char):
            lang = "JP"
        elif kr_pattern.match(char):
            lang = "KR"
        elif en_pattern.match(char):
            lang = "EN"
        # elif num_pattern.match(char):
        #     lang = prev_sentence
        else:
            lang = None
            tagged_text += char
            continue
        # if the language changed, insert tags
        if lang != prev_lang:
            tagged = 1
            if prev_lang is None:  # at the start
                tagged_text = tags[lang] + tagged_text
            else:
                tagged_text = tagged_text + tags[prev_lang] + tags[lang]

        # update tagging state
        prev_lang = lang

        # append the current character
        tagged_text += char

    # close the tag for the final language
    if prev_lang:
        tagged_text += tags[prev_lang]
    if not tagged:
        prev_lang = prev_sentence
        tagged_text = tags[prev_lang] + tagged_text + tags[prev_lang]

    return prev_lang, tagged_text

def tag_cke(text, prev_sentence=None):
    '''Tag Chinese/English/Korean text.'''
    # initialize tagging state
    tagged_text = ""
    prev_lang = None
    # whether everything was skipped and nothing got tagged
    tagged = 0

    # iterate over the text
    for char in text:
        # determine which language the current character belongs to
        if zh_pattern.match(char):
            lang = "ZH"
        elif kr_pattern.match(char):
            lang = "KR"
        elif en_pattern.match(char):
            lang = "EN"
        # elif num_pattern.match(char):
        #     lang = prev_sentence
        else:
            # skip
            lang = None
            tagged_text += char
            continue

        # if the language changed, insert tags
        if lang != prev_lang:
            tagged = 1
            if prev_lang is None:  # at the start
                tagged_text = tags[lang] + tagged_text
            else:
                tagged_text = tagged_text + tags[prev_lang] + tags[lang]

        # update tagging state
        prev_lang = lang

        # append the current character
        tagged_text += char

    # close the tag for the final language
    if prev_lang:
        tagged_text += tags[prev_lang]
    # if nothing was tagged, inherit the previous sentence's tag
    if tagged == 0:
        prev_lang = prev_sentence
        tagged_text = tags[prev_lang] + tagged_text + tags[prev_lang]
    return prev_lang, tagged_text
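To make the tagging behavior concrete, here is a hand-traced example of what tag_cjke should return for a small mixed-language string (illustrative rather than authoritative; the module is assumed to be utils.py, as the surrounding load_checkpoint/get_logger context suggests):

text = "你好hello。こんにちは。"
print(tag_cjke(text))
# expected: [ZH]你好[ZH][EN]hello。[EN][JA]こんにちは。[JA]
# The first sentence contains no kana, so tag_cke marks the Han run [ZH] and the
# Latin run [EN]; the second sentence contains kana, so tag_jke tags it [JA] whole.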
def load_checkpoint(checkpoint_path, model, optimizer=None, drop_speaker_emb=False):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
@@ -74,14 +204,29 @@ def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios=
        writer.add_audio(k, v, global_step, audio_sampling_rate)


-def latest_checkpoint_path(dir_path, regex="G_*.pth"):
+def extract_digits(f):
+    digits = "".join(filter(str.isdigit, f))
+    return int(digits) if digits else -1
+
+
+def latest_checkpoint_path(dir_path, regex="G_[0-9]*.pth"):
    f_list = glob.glob(os.path.join(dir_path, regex))
-   f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+   f_list.sort(key=lambda f: extract_digits(f))
    x = f_list[-1]
-   print(x)
+   print(f"latest_checkpoint_path:{x}")
    return x


+def oldest_checkpoint_path(dir_path, regex="G_[0-9]*.pth", preserved=4):
+    f_list = glob.glob(os.path.join(dir_path, regex))
+    f_list.sort(key=lambda f: extract_digits(f))
+    if len(f_list) > preserved:
+        x = f_list[0]
+        print(f"oldest_checkpoint_path:{x}")
+        return x
+    return ""
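extract_digits makes the sort robust to filenames without digits, and oldest_checkpoint_path pairs with the new --preserved argument below to cap disk usage. A hypothetical cleanup step a training loop might run after saving a checkpoint (not part of this diff):

old_g = oldest_checkpoint_path(hps.model_dir, "G_[0-9]*.pth", preserved=hps.preserved)
if old_g != "":
    os.remove(old_g)  # keep only the `preserved` most recent generator checkpoints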

def plot_spectrogram_to_numpy(spectrogram):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
@@ -148,6 +293,17 @@ def load_filepaths_and_text(filename, split="|"):
    return filepaths_and_text

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

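str2bool exists because argparse applies type to the raw string, and bool("False") is True, so a type=bool flag is enabled by any non-empty value. Parsing the text explicitly fixes that; a small demonstration:

print(bool("False"))      # True, which is why type=bool misparses "--flag False"
print(str2bool("False"))  # False
print(str2bool("yes"))    # True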

def get_hparams(init=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, default="./configs/modified_finetune_speaker.json",
@@ -156,7 +312,12 @@
                        help='Model name')
    parser.add_argument('-n', '--max_epochs', type=int, default=50,
                        help='finetune epochs')
-   parser.add_argument('--drop_speaker_embed', type=bool, default=False, help='whether to drop existing characters')
+   parser.add_argument('--cont', type=str2bool, default=False, help='whether to continue training on the latest checkpoint')
+   parser.add_argument('--drop_speaker_embed', type=str2bool, default=False, help='whether to drop existing characters')
+   parser.add_argument('--train_with_pretrained_model', type=str2bool, default=True,
+                       help='whether to train with pretrained model')
+   parser.add_argument('--preserved', type=int, default=4,
+                       help='Number of preserved models')

    args = parser.parse_args()
    model_dir = os.path.join("./", args.model)
@@ -179,7 +340,10 @@
    hparams = HParams(**config)
    hparams.model_dir = model_dir
    hparams.max_epochs = args.max_epochs
    hparams.cont = args.cont
    hparams.drop_speaker_embed = args.drop_speaker_embed
    hparams.train_with_pretrained_model = args.train_with_pretrained_model
    hparams.preserved = args.preserved
    return hparams
@@ -231,7 +395,7 @@ def get_logger(model_dir, filename="train.log"):
    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
-   h = logging.FileHandler(os.path.join(model_dir, filename))
+   h = logging.FileHandler(os.path.join(model_dir, filename), encoding="utf-8")
    h.setLevel(logging.DEBUG)
    h.setFormatter(formatter)
    logger.addHandler(h)
@@ -1,10 +0,0 @@
from moviepy.editor import AudioFileClip
import os
video_dir = "./video_data/"
audio_dir = "./raw_audio/"
filelist = list(os.walk(video_dir))[0][2]
for file in filelist:
    if file.endswith(".mp4"):
        my_audio_clip = AudioFileClip(video_dir + file)
        my_audio_clip.write_audiofile(audio_dir + file.rstrip(".mp4") + ".wav")