diff --git a/Dockerfile.mac b/Dockerfile.mac
new file mode 100644
index 000000000..596ae5679
--- /dev/null
+++ b/Dockerfile.mac
@@ -0,0 +1,37 @@
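+# CPU-only image intended for macOS hosts (run under linux/amd64 emulation, e.g. via Docker Desktop).
+# Example build/run (image tag is illustrative): docker build -f Dockerfile.mac -t rvc-webui . && docker run -p 7865:7865 rvc-webui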
+FROM python:3.10-slim
+
+EXPOSE 7865
+
+WORKDIR /app
+
+RUN apt-get update && \
+ apt-get install -y -qq ffmpeg aria2 git build-essential && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+
+COPY requirements.txt .
+
+RUN pip install --upgrade "pip<24.1" && \
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \
+ pip install --no-cache-dir -r requirements.txt && \
+ pip install fairseq==0.12.2 && \
+ pip install gradio==3.34.0 gradio-client==0.2.7
+
+COPY . .
+
+
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d assets/pretrained_v2/ -o D40k.pth && \
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d assets/pretrained_v2/ -o G40k.pth && \
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d assets/pretrained_v2/ -o f0D40k.pth && \
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d assets/pretrained_v2/ -o f0G40k.pth
+
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d assets/hubert -o hubert_base.pt && \
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d assets/rmvpe -o rmvpe.pt
+
+RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth" -d assets/uvr5_weights/ -o "HP2-人声vocals+非人声instrumentals.pth" && \
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth" -d assets/uvr5_weights/ -o "HP5-主旋律人声vocals+其他instrumentals.pth"
+
+VOLUME [ "/app/weights", "/app/logs", "/app/assets/weights" ]
+
+CMD ["python", "infer-web.py"]
diff --git a/configs/config.py b/configs/config.py
index a330fb543..47edad7f5 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -167,7 +167,8 @@ def device_config(self) -> tuple:
self.preprocess_per = 3.0
elif self.has_mps():
logger.info("No supported Nvidia GPU found")
- self.device = self.instead = "mps"
+ logger.info("MPS available but using CPU for stability")
+ self.device = self.instead = "cpu"
self.is_half = False
self.use_fp32_config()
else:
diff --git a/datasets/README.md b/datasets/README.md
new file mode 100644
index 000000000..554aca12f
--- /dev/null
+++ b/datasets/README.md
@@ -0,0 +1,36 @@
+# Voice Datasets
+
+This directory contains the audio datasets for training custom RVC models.
+
+## Structure
+
+Each subdirectory corresponds to a specific voice type:
+
+- `male_low/`: Bass/Baritone male voices
+- `male_mid/`: Tenor/Mid-range male voices
+- `female_low/`: Alto/Contralto female voices
+- `female_high/`: Soprano/High-range female voices
+- `anime_airy/`: Breathy/airy anime-style voices
+- `accent_non_native/`: Voices with distinct non-native accents
+- `singing_male/`: Male singing vocals
+- `singing_female/`: Female singing vocals
+- `child/`: Child voices
+- `elderly/`: Elderly voices
+
+## How to Add Data
+
+1. **Collect Audio**: Gather 10-15 minutes of clean, single-speaker audio for the desired category.
+2. **Place Files**: Put the raw audio files (MP3, WAV, etc.) into a temporary folder or directly into this directory.
+3. **Process**: Use the provided tool to normalize and split the audio.
+
+```bash
+# Example: Processing a raw file into the male_low dataset
+python tools/audio_preprocessor.py -i raw_audio/my_voice.mp3 -o datasets/male_low
+```
+
+## Requirements
+
+- **Format**: WAV (will be converted automatically; see the optional sketch below)
+- **Sample Rate**: 40kHz or 48kHz (will be converted automatically)
+- **Channels**: Mono (will be converted automatically)
+- **Quality**: No background noise, music, or reverb. Use UVR5 to clean if necessary.
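+
+The preprocessor applies these conversions for you. If you prefer to pre-convert files yourself, a minimal sketch (assuming `librosa` and `soundfile` are installed; the file paths are illustrative) might look like:
+
+```python
+import librosa
+import soundfile as sf
+
+# Load any decodable audio file as mono, resampled to 40 kHz
+audio, sr = librosa.load("raw_audio/my_voice.mp3", sr=40000, mono=True)
+
+# Write a 16-bit PCM WAV into the target dataset folder
+sf.write("datasets/male_low/my_voice.wav", audio, sr, subtype="PCM_16")
+```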
diff --git a/docker-compose.mac.yml b/docker-compose.mac.yml
new file mode 100644
index 000000000..94472795d
--- /dev/null
+++ b/docker-compose.mac.yml
@@ -0,0 +1,18 @@
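+# Start the stack with: docker compose -f docker-compose.mac.yml up --build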
+version: '3.8'
+
+services:
+ rvc-webui:
+ build:
+ context: .
+ dockerfile: Dockerfile.mac
+ ports:
+ - "7865:7865"
+ volumes:
+ - ./weights:/app/weights
+ - ./logs:/app/logs
+ - ./assets/weights:/app/assets/weights
+ - ./datasets:/app/datasets
+ environment:
+ - PYTHONUNBUFFERED=1
+ restart: unless-stopped
+ platform: linux/amd64
diff --git a/experiments/output_test.wav b/experiments/output_test.wav
new file mode 100644
index 000000000..468ab13b6
Binary files /dev/null and b/experiments/output_test.wav differ
diff --git a/experiments/voice1_to_voice2.wav b/experiments/voice1_to_voice2.wav
new file mode 100644
index 000000000..8095fa0fb
Binary files /dev/null and b/experiments/voice1_to_voice2.wav differ
diff --git a/infer-web.py b/infer-web.py
index 47596d539..eade72a28 100644
--- a/infer-web.py
+++ b/infer-web.py
@@ -114,6 +114,11 @@ def forward_dml(ctx, x, scale):
if if_gpu_ok and len(gpu_infos) > 0:
gpu_info = "\n".join(gpu_infos)
default_batch_size = min(mem) // 2
+elif torch.backends.mps.is_available():
+ if_gpu_ok = True
+ gpu_infos.append("0\tApple Silicon MPS")
+ gpu_info = "Apple Silicon MPS detected"
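+    # Conservative fixed default; unlike the CUDA path above, this branch does not query device memory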
+ default_batch_size = 4
else:
gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
default_batch_size = 1
@@ -220,6 +225,14 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
f.close()
+
+ # Verify trainset_dir exists
+ if not os.path.exists(trainset_dir):
+ error_msg = f"Training folder does not exist: {trainset_dir}"
+ logger.error(error_msg)
+ yield error_msg
+ return
+
cmd = '"%s" infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" %s %.1f' % (
config.python_cmd,
trainset_dir,
@@ -231,8 +244,19 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
config.preprocess_per,
)
logger.info("Execute: " + cmd)
- # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
- p = Popen(cmd, shell=True)
+ print(f"Starting preprocessing: {cmd}")
+ # Use shell=False with proper argument list for better reliability
+ cmd_args = [
+ config.python_cmd,
+ "infer/modules/train/preprocess.py",
+ trainset_dir,
+ str(sr),
+ str(n_p),
+ f"{now_dir}/logs/{exp_dir}",
+ str(config.noparallel),
+ str(config.preprocess_per),
+ ]
+ p = Popen(cmd_args, cwd=now_dir)
# 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
done = [False]
threading.Thread(
diff --git a/infer-web.pyi b/infer-web.pyi
new file mode 100644
index 000000000..9b6cfe35f
--- /dev/null
+++ b/infer-web.pyi
@@ -0,0 +1,1625 @@
+import os
+import sys
+from dotenv import load_dotenv
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+load_dotenv()
+from infer.modules.vc.modules import VC
+from infer.modules.uvr5.modules import uvr
+from infer.lib.train.process_ckpt import (
+ change_info,
+ extract_small_model,
+ merge,
+ show_info,
+)
+from i18n.i18n import I18nAuto
+from configs.config import Config
+from sklearn.cluster import MiniBatchKMeans
+import torch, platform
+import numpy as np
+import gradio as gr
+import faiss
+import fairseq
+import pathlib
+import json
+from time import sleep
+from subprocess import Popen
+from random import shuffle
+import warnings
+import traceback
+import threading
+import shutil
+import logging
+
+
+logging.getLogger("numba").setLevel(logging.WARNING)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
+logger = logging.getLogger(__name__)
+
+tmp = os.path.join(now_dir, "TEMP")
+shutil.rmtree(tmp, ignore_errors=True)
+shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
+shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True)
+os.makedirs(tmp, exist_ok=True)
+os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
+os.makedirs(os.path.join(now_dir, "assets/weights"), exist_ok=True)
+os.environ["TEMP"] = tmp
+warnings.filterwarnings("ignore")
+torch.manual_seed(114514)
+
+
+config = Config()
+vc = VC(config)
+
+
+if config.dml == True:
+
+ def forward_dml(ctx, x, scale):
+ ctx.scale = scale
+ res = x.clone().detach()
+ return res
+
+ fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
+i18n = I18nAuto()
+logger.info(i18n)
+# 判断是否有能用来训练和加速推理的N卡
+ngpu = torch.cuda.device_count()
+gpu_infos = []
+mem = []
+if_gpu_ok = False
+
+if torch.cuda.is_available() or ngpu != 0:
+ for i in range(ngpu):
+ gpu_name = torch.cuda.get_device_name(i)
+ if any(
+ value in gpu_name.upper()
+ for value in [
+ "10",
+ "16",
+ "20",
+ "30",
+ "40",
+ "A2",
+ "A3",
+ "A4",
+ "P4",
+ "A50",
+ "500",
+ "A60",
+ "70",
+ "80",
+ "90",
+ "M4",
+ "T4",
+ "TITAN",
+ "4060",
+ "L",
+ "6000",
+ ]
+ ):
+ # A10#A100#V100#A40#P40#M40#K80#A4500
+ if_gpu_ok = True # 至少有一张能用的N卡
+ gpu_infos.append("%s\t%s" % (i, gpu_name))
+ mem.append(
+ int(
+ torch.cuda.get_device_properties(i).total_memory
+ / 1024
+ / 1024
+ / 1024
+ + 0.4
+ )
+ )
+if if_gpu_ok and len(gpu_infos) > 0:
+ gpu_info = "\n".join(gpu_infos)
+ default_batch_size = min(mem) // 2
+else:
+ gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
+ default_batch_size = 1
+gpus = "-".join([i[0] for i in gpu_infos])
+
+from gradio.events import Dependency
+
+class ToolButton(gr.Button, gr.components.FormComponent):
+ """Small button with single emoji as text, fits inside gradio forms"""
+
+ def __init__(self, **kwargs):
+ super().__init__(variant="tool", **kwargs)
+
+ def get_block_name(self):
+ return "button"
+ from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING
+ from gradio.blocks import Block
+ if TYPE_CHECKING:
+ from gradio.components import Timer
+ from gradio.components.base import Component
+
+
+weight_root = os.getenv("weight_root")
+weight_uvr5_root = os.getenv("weight_uvr5_root")
+index_root = os.getenv("index_root")
+outside_index_root = os.getenv("outside_index_root")
+
+names = []
+for name in os.listdir(weight_root):
+ if name.endswith(".pth"):
+ names.append(name)
+index_paths = []
+
+
+def lookup_indices(index_root):
+ global index_paths
+ for root, dirs, files in os.walk(index_root, topdown=False):
+ for name in files:
+ if name.endswith(".index") and "trained" not in name:
+ index_paths.append("%s/%s" % (root, name))
+
+
+lookup_indices(index_root)
+lookup_indices(outside_index_root)
+uvr5_names = []
+for name in os.listdir(weight_uvr5_root):
+ if name.endswith(".pth") or "onnx" in name:
+ uvr5_names.append(name.replace(".pth", ""))
+
+
+def change_choices():
+ names = []
+ for name in os.listdir(weight_root):
+ if name.endswith(".pth"):
+ names.append(name)
+ index_paths = []
+ for root, dirs, files in os.walk(index_root, topdown=False):
+ for name in files:
+ if name.endswith(".index") and "trained" not in name:
+ index_paths.append("%s/%s" % (root, name))
+ return {"choices": sorted(names), "__type__": "update"}, {
+ "choices": sorted(index_paths),
+ "__type__": "update",
+ }
+
+
+def clean():
+ return {"value": "", "__type__": "update"}
+
+
+def export_onnx(ModelPath, ExportedPath):
+ from infer.modules.onnx.export import export_onnx as eo
+
+ eo(ModelPath, ExportedPath)
+
+
+sr_dict = {
+ "32k": 32000,
+ "40k": 40000,
+ "48k": 48000,
+}
+
+
+def if_done(done, p):
+ while 1:
+ if p.poll() is None:
+ sleep(0.5)
+ else:
+ break
+ done[0] = True
+
+
+def if_done_multi(done, ps):
+ while 1:
+ # poll==None代表进程未结束
+ # 只要有一个进程未结束都不停
+ flag = 1
+ for p in ps:
+ if p.poll() is None:
+ flag = 0
+ sleep(0.5)
+ break
+ if flag == 1:
+ break
+ done[0] = True
+
+
+def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
+ sr = sr_dict[sr]
+ os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
+ f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
+ f.close()
+ cmd = '"%s" infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" %s %.1f' % (
+ config.python_cmd,
+ trainset_dir,
+ sr,
+ n_p,
+ now_dir,
+ exp_dir,
+ config.noparallel,
+ config.preprocess_per,
+ )
+ logger.info("Execute: " + cmd)
+ # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
+ p = Popen(cmd, shell=True)
+ # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
+ done = [False]
+ threading.Thread(
+ target=if_done,
+ args=(
+ done,
+ p,
+ ),
+ ).start()
+ while 1:
+ with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
+ yield (f.read())
+ sleep(1)
+ if done[0]:
+ break
+ with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
+ log = f.read()
+ logger.info(log)
+ yield log
+
+
+# but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
+def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvpe):
+ gpus = gpus.split("-")
+ os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
+ f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
+ f.close()
+ if if_f0:
+ if f0method != "rmvpe_gpu":
+ cmd = (
+ '"%s" infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s'
+ % (
+ config.python_cmd,
+ now_dir,
+ exp_dir,
+ n_p,
+ f0method,
+ )
+ )
+ logger.info("Execute: " + cmd)
+ p = Popen(
+ cmd, shell=True, cwd=now_dir
+ ) # , stdin=PIPE, stdout=PIPE,stderr=PIPE
+ # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
+ done = [False]
+ threading.Thread(
+ target=if_done,
+ args=(
+ done,
+ p,
+ ),
+ ).start()
+ else:
+ if gpus_rmvpe != "-":
+ gpus_rmvpe = gpus_rmvpe.split("-")
+ leng = len(gpus_rmvpe)
+ ps = []
+ for idx, n_g in enumerate(gpus_rmvpe):
+ cmd = (
+ '"%s" infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s '
+ % (
+ config.python_cmd,
+ leng,
+ idx,
+ n_g,
+ now_dir,
+ exp_dir,
+ config.is_half,
+ )
+ )
+ logger.info("Execute: " + cmd)
+ p = Popen(
+ cmd, shell=True, cwd=now_dir
+ ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+ ps.append(p)
+ # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
+ done = [False]
+ threading.Thread(
+ target=if_done_multi, #
+ args=(
+ done,
+ ps,
+ ),
+ ).start()
+ else:
+ cmd = (
+ config.python_cmd
+ + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" '
+ % (
+ now_dir,
+ exp_dir,
+ )
+ )
+ logger.info("Execute: " + cmd)
+ p = Popen(
+ cmd, shell=True, cwd=now_dir
+ ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+ p.wait()
+ done = [True]
+ while 1:
+ with open(
+ "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r"
+ ) as f:
+ yield (f.read())
+ sleep(1)
+ if done[0]:
+ break
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+ log = f.read()
+ logger.info(log)
+ yield log
+ # 对不同part分别开多进程
+ """
+ n_part=int(sys.argv[1])
+ i_part=int(sys.argv[2])
+ i_gpu=sys.argv[3]
+ exp_dir=sys.argv[4]
+ os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
+ """
+ leng = len(gpus)
+ ps = []
+ for idx, n_g in enumerate(gpus):
+ cmd = (
+ '"%s" infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s %s'
+ % (
+ config.python_cmd,
+ config.device,
+ leng,
+ idx,
+ n_g,
+ now_dir,
+ exp_dir,
+ version19,
+ config.is_half,
+ )
+ )
+ logger.info("Execute: " + cmd)
+ p = Popen(
+ cmd, shell=True, cwd=now_dir
+ ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+ ps.append(p)
+ # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
+ done = [False]
+ threading.Thread(
+ target=if_done_multi,
+ args=(
+ done,
+ ps,
+ ),
+ ).start()
+ while 1:
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+ yield (f.read())
+ sleep(1)
+ if done[0]:
+ break
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+ log = f.read()
+ logger.info(log)
+ yield log
+
+
+def get_pretrained_models(path_str, f0_str, sr2):
+ if_pretrained_generator_exist = os.access(
+ "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
+ )
+ if_pretrained_discriminator_exist = os.access(
+ "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
+ )
+ if not if_pretrained_generator_exist:
+ logger.warning(
+ "assets/pretrained%s/%sG%s.pth not exist, will not use pretrained model",
+ path_str,
+ f0_str,
+ sr2,
+ )
+ if not if_pretrained_discriminator_exist:
+ logger.warning(
+ "assets/pretrained%s/%sD%s.pth not exist, will not use pretrained model",
+ path_str,
+ f0_str,
+ sr2,
+ )
+ return (
+ (
+ "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+ if if_pretrained_generator_exist
+ else ""
+ ),
+ (
+ "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+ if if_pretrained_discriminator_exist
+ else ""
+ ),
+ )
+
+
+def change_sr2(sr2, if_f0_3, version19):
+ path_str = "" if version19 == "v1" else "_v2"
+ f0_str = "f0" if if_f0_3 else ""
+ return get_pretrained_models(path_str, f0_str, sr2)
+
+
+def change_version19(sr2, if_f0_3, version19):
+ path_str = "" if version19 == "v1" else "_v2"
+ if sr2 == "32k" and version19 == "v1":
+ sr2 = "40k"
+ to_return_sr2 = (
+ {"choices": ["40k", "48k"], "__type__": "update", "value": sr2}
+ if version19 == "v1"
+ else {"choices": ["40k", "48k", "32k"], "__type__": "update", "value": sr2}
+ )
+ f0_str = "f0" if if_f0_3 else ""
+ return (
+ *get_pretrained_models(path_str, f0_str, sr2),
+ to_return_sr2,
+ )
+
+
+def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15
+ path_str = "" if version19 == "v1" else "_v2"
+ return (
+ {"visible": if_f0_3, "__type__": "update"},
+ {"visible": if_f0_3, "__type__": "update"},
+ *get_pretrained_models(path_str, "f0" if if_f0_3 == True else "", sr2),
+ )
+
+
+# but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
+def click_train(
+ exp_dir1,
+ sr2,
+ if_f0_3,
+ spk_id5,
+ save_epoch10,
+ total_epoch11,
+ batch_size12,
+ if_save_latest13,
+ pretrained_G14,
+ pretrained_D15,
+ gpus16,
+ if_cache_gpu17,
+ if_save_every_weights18,
+ version19,
+):
+ # 生成filelist
+ exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
+ os.makedirs(exp_dir, exist_ok=True)
+ gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
+ feature_dir = (
+ "%s/3_feature256" % (exp_dir)
+ if version19 == "v1"
+ else "%s/3_feature768" % (exp_dir)
+ )
+ if if_f0_3:
+ f0_dir = "%s/2a_f0" % (exp_dir)
+ f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
+ names = (
+ set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
+ & set([name.split(".")[0] for name in os.listdir(feature_dir)])
+ & set([name.split(".")[0] for name in os.listdir(f0_dir)])
+ & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
+ )
+ else:
+ names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
+ [name.split(".")[0] for name in os.listdir(feature_dir)]
+ )
+ opt = []
+ for name in names:
+ if if_f0_3:
+ opt.append(
+ "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
+ % (
+ gt_wavs_dir.replace("\\", "\\\\"),
+ name,
+ feature_dir.replace("\\", "\\\\"),
+ name,
+ f0_dir.replace("\\", "\\\\"),
+ name,
+ f0nsf_dir.replace("\\", "\\\\"),
+ name,
+ spk_id5,
+ )
+ )
+ else:
+ opt.append(
+ "%s/%s.wav|%s/%s.npy|%s"
+ % (
+ gt_wavs_dir.replace("\\", "\\\\"),
+ name,
+ feature_dir.replace("\\", "\\\\"),
+ name,
+ spk_id5,
+ )
+ )
+ fea_dim = 256 if version19 == "v1" else 768
+ if if_f0_3:
+ for _ in range(2):
+ opt.append(
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
+ % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
+ )
+ else:
+ for _ in range(2):
+ opt.append(
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
+ % (now_dir, sr2, now_dir, fea_dim, spk_id5)
+ )
+ shuffle(opt)
+ with open("%s/filelist.txt" % exp_dir, "w") as f:
+ f.write("\n".join(opt))
+ logger.debug("Write filelist done")
+ # 生成config#无需生成config
+ # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
+ logger.info("Use gpus: %s", str(gpus16))
+ if pretrained_G14 == "":
+ logger.info("No pretrained Generator")
+ if pretrained_D15 == "":
+ logger.info("No pretrained Discriminator")
+ if version19 == "v1" or sr2 == "40k":
+ config_path = "v1/%s.json" % sr2
+ else:
+ config_path = "v2/%s.json" % sr2
+ config_save_path = os.path.join(exp_dir, "config.json")
+ if not pathlib.Path(config_save_path).exists():
+ with open(config_save_path, "w", encoding="utf-8") as f:
+ json.dump(
+ config.json_config[config_path],
+ f,
+ ensure_ascii=False,
+ indent=4,
+ sort_keys=True,
+ )
+ f.write("\n")
+ if gpus16:
+ cmd = (
+ '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s'
+ % (
+ config.python_cmd,
+ exp_dir1,
+ sr2,
+ 1 if if_f0_3 else 0,
+ batch_size12,
+ gpus16,
+ total_epoch11,
+ save_epoch10,
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+ 1 if if_save_latest13 == i18n("是") else 0,
+ 1 if if_cache_gpu17 == i18n("是") else 0,
+ 1 if if_save_every_weights18 == i18n("是") else 0,
+ version19,
+ )
+ )
+ else:
+ cmd = (
+ '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s'
+ % (
+ config.python_cmd,
+ exp_dir1,
+ sr2,
+ 1 if if_f0_3 else 0,
+ batch_size12,
+ total_epoch11,
+ save_epoch10,
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+ 1 if if_save_latest13 == i18n("是") else 0,
+ 1 if if_cache_gpu17 == i18n("是") else 0,
+ 1 if if_save_every_weights18 == i18n("是") else 0,
+ version19,
+ )
+ )
+ logger.info("Execute: " + cmd)
+ p = Popen(cmd, shell=True, cwd=now_dir)
+ p.wait()
+ return "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"
+
+
+# but4.click(train_index, [exp_dir1], info3)
+def train_index(exp_dir1, version19):
+ # exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
+ exp_dir = "logs/%s" % (exp_dir1)
+ os.makedirs(exp_dir, exist_ok=True)
+ feature_dir = (
+ "%s/3_feature256" % (exp_dir)
+ if version19 == "v1"
+ else "%s/3_feature768" % (exp_dir)
+ )
+ if not os.path.exists(feature_dir):
+ return "请先进行特征提取!"
+ listdir_res = list(os.listdir(feature_dir))
+ if len(listdir_res) == 0:
+ return "请先进行特征提取!"
+ infos = []
+ npys = []
+ for name in sorted(listdir_res):
+ phone = np.load("%s/%s" % (feature_dir, name))
+ npys.append(phone)
+ big_npy = np.concatenate(npys, 0)
+ big_npy_idx = np.arange(big_npy.shape[0])
+ np.random.shuffle(big_npy_idx)
+ big_npy = big_npy[big_npy_idx]
+ if big_npy.shape[0] > 2e5:
+ infos.append("Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0])
+ yield "\n".join(infos)
+ try:
+ big_npy = (
+ MiniBatchKMeans(
+ n_clusters=10000,
+ verbose=True,
+ batch_size=256 * config.n_cpu,
+ compute_labels=False,
+ init="random",
+ )
+ .fit(big_npy)
+ .cluster_centers_
+ )
+ except:
+ info = traceback.format_exc()
+ logger.info(info)
+ infos.append(info)
+ yield "\n".join(infos)
+
+ np.save("%s/total_fea.npy" % exp_dir, big_npy)
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+ infos.append("%s,%s" % (big_npy.shape, n_ivf))
+ yield "\n".join(infos)
+ index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
+ # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
+ infos.append("training")
+ yield "\n".join(infos)
+ index_ivf = faiss.extract_index_ivf(index) #
+ index_ivf.nprobe = 1
+ index.train(big_npy)
+ faiss.write_index(
+ index,
+ "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
+ % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+ )
+ infos.append("adding")
+ yield "\n".join(infos)
+ batch_size_add = 8192
+ for i in range(0, big_npy.shape[0], batch_size_add):
+ index.add(big_npy[i : i + batch_size_add])
+ faiss.write_index(
+ index,
+ "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+ % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+ )
+ infos.append(
+ "成功构建索引 added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+ % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
+ )
+ try:
+ link = os.link if platform.system() == "Windows" else os.symlink
+ link(
+ "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+ % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+ "%s/%s_IVF%s_Flat_nprobe_%s_%s_%s.index"
+ % (
+ outside_index_root,
+ exp_dir1,
+ n_ivf,
+ index_ivf.nprobe,
+ exp_dir1,
+ version19,
+ ),
+ )
+ infos.append("链接索引到外部-%s" % (outside_index_root))
+ except:
+ infos.append("链接索引到外部-%s失败" % (outside_index_root))
+
+ # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
+ # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
+ yield "\n".join(infos)
+
+
+# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
+def train1key(
+ exp_dir1,
+ sr2,
+ if_f0_3,
+ trainset_dir4,
+ spk_id5,
+ np7,
+ f0method8,
+ save_epoch10,
+ total_epoch11,
+ batch_size12,
+ if_save_latest13,
+ pretrained_G14,
+ pretrained_D15,
+ gpus16,
+ if_cache_gpu17,
+ if_save_every_weights18,
+ version19,
+ gpus_rmvpe,
+):
+ infos = []
+
+ def get_info_str(strr):
+ infos.append(strr)
+ return "\n".join(infos)
+
+ # step1:处理数据
+ yield get_info_str(i18n("step1:正在处理数据"))
+ [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)]
+
+ # step2a:提取音高
+ yield get_info_str(i18n("step2:正在提取音高&正在提取特征"))
+ [
+ get_info_str(_)
+ for _ in extract_f0_feature(
+ gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe
+ )
+ ]
+
+ # step3a:训练模型
+ yield get_info_str(i18n("step3a:正在训练模型"))
+ click_train(
+ exp_dir1,
+ sr2,
+ if_f0_3,
+ spk_id5,
+ save_epoch10,
+ total_epoch11,
+ batch_size12,
+ if_save_latest13,
+ pretrained_G14,
+ pretrained_D15,
+ gpus16,
+ if_cache_gpu17,
+ if_save_every_weights18,
+ version19,
+ )
+ yield get_info_str(
+ i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log")
+ )
+
+ # step3b:训练索引
+ [get_info_str(_) for _ in train_index(exp_dir1, version19)]
+ yield get_info_str(i18n("全流程结束!"))
+
+
+# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
+def change_info_(ckpt_path):
+ if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
+ return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
+ try:
+ with open(
+ ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r"
+ ) as f:
+ info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
+ sr, f0 = info["sample_rate"], info["if_f0"]
+ version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
+ return sr, str(f0), version
+ except:
+ traceback.print_exc()
+ return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
+
+
+F0GPUVisible = config.dml == False
+
+
+def change_f0_method(f0method8):
+ if f0method8 == "rmvpe_gpu":
+ visible = F0GPUVisible
+ else:
+ visible = False
+ return {"visible": visible, "__type__": "update"}
+
+
+with gr.Blocks(title="RVC WebUI") as app:
+ gr.Markdown("## RVC WebUI")
+ gr.Markdown(
+ value=i18n(
+            "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. 如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE."
+ )
+ )
+ with gr.Tabs():
+ with gr.TabItem(i18n("模型推理")):
+ with gr.Row():
+ sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
+ with gr.Column():
+ refresh_button = gr.Button(
+ i18n("刷新音色列表和索引路径"), variant="primary"
+ )
+ clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
+ spk_item = gr.Slider(
+ minimum=0,
+ maximum=2333,
+ step=1,
+ label=i18n("请选择说话人id"),
+ value=0,
+ visible=False,
+ interactive=True,
+ )
+ clean_button.click(
+ fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
+ )
+ with gr.TabItem(i18n("单次推理")):
+ with gr.Group():
+ with gr.Row():
+ with gr.Column():
+ vc_transform0 = gr.Number(
+ label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
+ value=0,
+ )
+ input_audio0 = gr.Textbox(
+ label=i18n(
+ "输入待处理音频文件路径(默认是正确格式示例)"
+ ),
+ placeholder="C:\\Users\\Desktop\\audio_example.wav",
+ )
+ file_index1 = gr.Textbox(
+ label=i18n(
+ "特征检索库文件路径,为空则使用下拉的选择结果"
+ ),
+ placeholder="C:\\Users\\Desktop\\model_example.index",
+ interactive=True,
+ )
+ file_index2 = gr.Dropdown(
+ label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+ choices=sorted(index_paths),
+ interactive=True,
+ )
+ f0method0 = gr.Radio(
+ label=i18n(
+ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+ ),
+ choices=(
+ ["pm", "harvest", "crepe", "rmvpe"]
+ if config.dml == False
+ else ["pm", "harvest", "rmvpe"]
+ ),
+ value="rmvpe",
+ interactive=True,
+ )
+
+ with gr.Column():
+ resample_sr0 = gr.Slider(
+ minimum=0,
+ maximum=48000,
+ label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
+ value=0,
+ step=1,
+ interactive=True,
+ )
+ rms_mix_rate0 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n(
+ "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"
+ ),
+ value=0.25,
+ interactive=True,
+ )
+ protect0 = gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label=i18n(
+ "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
+ ),
+ value=0.33,
+ step=0.01,
+ interactive=True,
+ )
+ filter_radius0 = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n(
+ ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"
+ ),
+ value=3,
+ step=1,
+ interactive=True,
+ )
+ index_rate1 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("检索特征占比"),
+ value=0.75,
+ interactive=True,
+ )
+ f0_file = gr.File(
+ label=i18n(
+ "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"
+ ),
+ visible=False,
+ )
+
+ refresh_button.click(
+ fn=change_choices,
+ inputs=[],
+ outputs=[sid0, file_index2],
+ api_name="infer_refresh",
+ )
+ # file_big_npy1 = gr.Textbox(
+ # label=i18n("特征文件路径"),
+ # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
+ # interactive=True,
+ # )
+ with gr.Group():
+ with gr.Column():
+ but0 = gr.Button(i18n("转换"), variant="primary")
+ with gr.Row():
+ vc_output1 = gr.Textbox(label=i18n("输出信息"))
+ vc_output2 = gr.Audio(
+ label=i18n("输出音频(右下角三个点,点了可以下载)")
+ )
+
+ but0.click(
+ vc.vc_single,
+ [
+ spk_item,
+ input_audio0,
+ vc_transform0,
+ f0_file,
+ f0method0,
+ file_index1,
+ file_index2,
+ # file_big_npy1,
+ index_rate1,
+ filter_radius0,
+ resample_sr0,
+ rms_mix_rate0,
+ protect0,
+ ],
+ [vc_output1, vc_output2],
+ api_name="infer_convert",
+ )
+ with gr.TabItem(i18n("批量推理")):
+ gr.Markdown(
+ value=i18n(
+ "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. "
+ )
+ )
+ with gr.Row():
+ with gr.Column():
+ vc_transform1 = gr.Number(
+ label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
+ value=0,
+ )
+ opt_input = gr.Textbox(
+ label=i18n("指定输出文件夹"), value="opt"
+ )
+ file_index3 = gr.Textbox(
+ label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
+ value="",
+ interactive=True,
+ )
+ file_index4 = gr.Dropdown(
+ label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+ choices=sorted(index_paths),
+ interactive=True,
+ )
+ f0method1 = gr.Radio(
+ label=i18n(
+ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+ ),
+ choices=(
+ ["pm", "harvest", "crepe", "rmvpe"]
+ if config.dml == False
+ else ["pm", "harvest", "rmvpe"]
+ ),
+ value="rmvpe",
+ interactive=True,
+ )
+ format1 = gr.Radio(
+ label=i18n("导出文件格式"),
+ choices=["wav", "flac", "mp3", "m4a"],
+ value="wav",
+ interactive=True,
+ )
+
+ refresh_button.click(
+ fn=lambda: change_choices()[1],
+ inputs=[],
+ outputs=file_index4,
+ api_name="infer_refresh_batch",
+ )
+ # file_big_npy2 = gr.Textbox(
+ # label=i18n("特征文件路径"),
+ # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
+ # interactive=True,
+ # )
+
+ with gr.Column():
+ resample_sr1 = gr.Slider(
+ minimum=0,
+ maximum=48000,
+ label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
+ value=0,
+ step=1,
+ interactive=True,
+ )
+ rms_mix_rate1 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n(
+ "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"
+ ),
+ value=1,
+ interactive=True,
+ )
+ protect1 = gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label=i18n(
+ "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
+ ),
+ value=0.33,
+ step=0.01,
+ interactive=True,
+ )
+ filter_radius1 = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n(
+ ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"
+ ),
+ value=3,
+ step=1,
+ interactive=True,
+ )
+ index_rate2 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("检索特征占比"),
+ value=1,
+ interactive=True,
+ )
+ with gr.Row():
+ dir_input = gr.Textbox(
+ label=i18n(
+ "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"
+ ),
+ placeholder="C:\\Users\\Desktop\\input_vocal_dir",
+ )
+ inputs = gr.File(
+ file_count="multiple",
+ label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
+ )
+
+ with gr.Row():
+ but1 = gr.Button(i18n("转换"), variant="primary")
+ vc_output3 = gr.Textbox(label=i18n("输出信息"))
+
+ but1.click(
+ vc.vc_multi,
+ [
+ spk_item,
+ dir_input,
+ opt_input,
+ inputs,
+ vc_transform1,
+ f0method1,
+ file_index3,
+ file_index4,
+ # file_big_npy2,
+ index_rate2,
+ filter_radius1,
+ resample_sr1,
+ rms_mix_rate1,
+ protect1,
+ format1,
+ ],
+ [vc_output3],
+ api_name="infer_convert_batch",
+ )
+ sid0.change(
+ fn=vc.get_vc,
+ inputs=[sid0, protect0, protect1],
+ outputs=[spk_item, protect0, protect1, file_index2, file_index4],
+ api_name="infer_change_voice",
+ )
+ with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
+ with gr.Group():
+ gr.Markdown(
+ value=i18n(
+                        "人声伴奏分离批量处理, 使用UVR5模型。 合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 模型分为三类: 1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; 2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; 3、去混响、去延迟模型(by FoxJoy): (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响; (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。 去混响/去延迟,附: 1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍; 2、MDX-Net-Dereverb模型挺慢的; 3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"
+ )
+ )
+ with gr.Row():
+ with gr.Column():
+ dir_wav_input = gr.Textbox(
+ label=i18n("输入待处理音频文件夹路径"),
+ placeholder="C:\\Users\\Desktop\\todo-songs",
+ )
+ wav_inputs = gr.File(
+ file_count="multiple",
+ label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
+ )
+ with gr.Column():
+ model_choose = gr.Dropdown(
+ label=i18n("模型"), choices=uvr5_names
+ )
+ agg = gr.Slider(
+ minimum=0,
+ maximum=20,
+ step=1,
+ label="人声提取激进程度",
+ value=10,
+ interactive=True,
+ visible=False, # 先不开放调整
+ )
+ opt_vocal_root = gr.Textbox(
+ label=i18n("指定输出主人声文件夹"), value="opt"
+ )
+ opt_ins_root = gr.Textbox(
+ label=i18n("指定输出非主人声文件夹"), value="opt"
+ )
+ format0 = gr.Radio(
+ label=i18n("导出文件格式"),
+ choices=["wav", "flac", "mp3", "m4a"],
+ value="flac",
+ interactive=True,
+ )
+ but2 = gr.Button(i18n("转换"), variant="primary")
+ vc_output4 = gr.Textbox(label=i18n("输出信息"))
+ but2.click(
+ uvr,
+ [
+ model_choose,
+ dir_wav_input,
+ opt_vocal_root,
+ wav_inputs,
+ opt_ins_root,
+ agg,
+ format0,
+ ],
+ [vc_output4],
+ api_name="uvr_convert",
+ )
+ with gr.TabItem(i18n("训练")):
+ gr.Markdown(
+ value=i18n(
+ "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. "
+ )
+ )
+ with gr.Row():
+ exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test")
+ sr2 = gr.Radio(
+ label=i18n("目标采样率"),
+ choices=["40k", "48k"],
+ value="40k",
+ interactive=True,
+ )
+ if_f0_3 = gr.Radio(
+ label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"),
+ choices=[True, False],
+ value=True,
+ interactive=True,
+ )
+ version19 = gr.Radio(
+ label=i18n("版本"),
+ choices=["v1", "v2"],
+ value="v2",
+ interactive=True,
+ visible=True,
+ )
+ np7 = gr.Slider(
+ minimum=0,
+ maximum=config.n_cpu,
+ step=1,
+ label=i18n("提取音高和处理数据使用的CPU进程数"),
+ value=int(np.ceil(config.n_cpu / 1.5)),
+ interactive=True,
+ )
+ with gr.Group(): # 暂时单人的, 后面支持最多4人的#数据处理
+ gr.Markdown(
+ value=i18n(
+ "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. "
+ )
+ )
+ with gr.Row():
+ trainset_dir4 = gr.Textbox(
+ label=i18n("输入训练文件夹路径"),
+ value=i18n("E:\\语音音频+标注\\米津玄师\\src"),
+ )
+ spk_id5 = gr.Slider(
+ minimum=0,
+ maximum=4,
+ step=1,
+ label=i18n("请指定说话人id"),
+ value=0,
+ interactive=True,
+ )
+ but1 = gr.Button(i18n("处理数据"), variant="primary")
+ info1 = gr.Textbox(label=i18n("输出信息"), value="")
+ but1.click(
+ preprocess_dataset,
+ [trainset_dir4, exp_dir1, sr2, np7],
+ [info1],
+ api_name="train_preprocess",
+ )
+ with gr.Group():
+ gr.Markdown(
+ value=i18n(
+ "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)"
+ )
+ )
+ with gr.Row():
+ with gr.Column():
+ gpus6 = gr.Textbox(
+ label=i18n(
+ "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"
+ ),
+ value=gpus,
+ interactive=True,
+ visible=F0GPUVisible,
+ )
+ gpu_info9 = gr.Textbox(
+ label=i18n("显卡信息"), value=gpu_info, visible=F0GPUVisible
+ )
+ with gr.Column():
+ f0method8 = gr.Radio(
+ label=i18n(
+ "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU"
+ ),
+ choices=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"],
+ value="rmvpe_gpu",
+ interactive=True,
+ )
+ gpus_rmvpe = gr.Textbox(
+ label=i18n(
+ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程"
+ ),
+ value="%s-%s" % (gpus, gpus),
+ interactive=True,
+ visible=F0GPUVisible,
+ )
+ but2 = gr.Button(i18n("特征提取"), variant="primary")
+ info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+ f0method8.change(
+ fn=change_f0_method,
+ inputs=[f0method8],
+ outputs=[gpus_rmvpe],
+ )
+ but2.click(
+ extract_f0_feature,
+ [
+ gpus6,
+ np7,
+ f0method8,
+ if_f0_3,
+ exp_dir1,
+ version19,
+ gpus_rmvpe,
+ ],
+ [info2],
+ api_name="train_extract_f0_feature",
+ )
+ with gr.Group():
+ gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引"))
+ with gr.Row():
+ save_epoch10 = gr.Slider(
+ minimum=1,
+ maximum=50,
+ step=1,
+ label=i18n("保存频率save_every_epoch"),
+ value=5,
+ interactive=True,
+ )
+ total_epoch11 = gr.Slider(
+ minimum=2,
+ maximum=1000,
+ step=1,
+ label=i18n("总训练轮数total_epoch"),
+ value=20,
+ interactive=True,
+ )
+ batch_size12 = gr.Slider(
+ minimum=1,
+ maximum=40,
+ step=1,
+ label=i18n("每张显卡的batch_size"),
+ value=default_batch_size,
+ interactive=True,
+ )
+ if_save_latest13 = gr.Radio(
+ label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"),
+ choices=[i18n("是"), i18n("否")],
+ value=i18n("否"),
+ interactive=True,
+ )
+ if_cache_gpu17 = gr.Radio(
+ label=i18n(
+ "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速"
+ ),
+ choices=[i18n("是"), i18n("否")],
+ value=i18n("否"),
+ interactive=True,
+ )
+ if_save_every_weights18 = gr.Radio(
+ label=i18n(
+ "是否在每次保存时间点将最终小模型保存至weights文件夹"
+ ),
+ choices=[i18n("是"), i18n("否")],
+ value=i18n("否"),
+ interactive=True,
+ )
+ with gr.Row():
+ pretrained_G14 = gr.Textbox(
+ label=i18n("加载预训练底模G路径"),
+ value="assets/pretrained_v2/f0G40k.pth",
+ interactive=True,
+ )
+ pretrained_D15 = gr.Textbox(
+ label=i18n("加载预训练底模D路径"),
+ value="assets/pretrained_v2/f0D40k.pth",
+ interactive=True,
+ )
+ sr2.change(
+ change_sr2,
+ [sr2, if_f0_3, version19],
+ [pretrained_G14, pretrained_D15],
+ )
+ version19.change(
+ change_version19,
+ [sr2, if_f0_3, version19],
+ [pretrained_G14, pretrained_D15, sr2],
+ )
+ if_f0_3.change(
+ change_f0,
+ [if_f0_3, sr2, version19],
+ [f0method8, gpus_rmvpe, pretrained_G14, pretrained_D15],
+ )
+ gpus16 = gr.Textbox(
+ label=i18n(
+ "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"
+ ),
+ value=gpus,
+ interactive=True,
+ )
+ but3 = gr.Button(i18n("训练模型"), variant="primary")
+ but4 = gr.Button(i18n("训练特征索引"), variant="primary")
+ but5 = gr.Button(i18n("一键训练"), variant="primary")
+ info3 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=10)
+ but3.click(
+ click_train,
+ [
+ exp_dir1,
+ sr2,
+ if_f0_3,
+ spk_id5,
+ save_epoch10,
+ total_epoch11,
+ batch_size12,
+ if_save_latest13,
+ pretrained_G14,
+ pretrained_D15,
+ gpus16,
+ if_cache_gpu17,
+ if_save_every_weights18,
+ version19,
+ ],
+ info3,
+ api_name="train_start",
+ )
+ but4.click(train_index, [exp_dir1, version19], info3)
+ but5.click(
+ train1key,
+ [
+ exp_dir1,
+ sr2,
+ if_f0_3,
+ trainset_dir4,
+ spk_id5,
+ np7,
+ f0method8,
+ save_epoch10,
+ total_epoch11,
+ batch_size12,
+ if_save_latest13,
+ pretrained_G14,
+ pretrained_D15,
+ gpus16,
+ if_cache_gpu17,
+ if_save_every_weights18,
+ version19,
+ gpus_rmvpe,
+ ],
+ info3,
+ api_name="train_start_all",
+ )
+
+ with gr.TabItem(i18n("ckpt处理")):
+ with gr.Group():
+ gr.Markdown(value=i18n("模型融合, 可用于测试音色融合"))
+ with gr.Row():
+ ckpt_a = gr.Textbox(
+ label=i18n("A模型路径"), value="", interactive=True
+ )
+ ckpt_b = gr.Textbox(
+ label=i18n("B模型路径"), value="", interactive=True
+ )
+ alpha_a = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("A模型权重"),
+ value=0.5,
+ interactive=True,
+ )
+ with gr.Row():
+ sr_ = gr.Radio(
+ label=i18n("目标采样率"),
+ choices=["40k", "48k"],
+ value="40k",
+ interactive=True,
+ )
+ if_f0_ = gr.Radio(
+ label=i18n("模型是否带音高指导"),
+ choices=[i18n("是"), i18n("否")],
+ value=i18n("是"),
+ interactive=True,
+ )
+ info__ = gr.Textbox(
+ label=i18n("要置入的模型信息"),
+ value="",
+ max_lines=8,
+ interactive=True,
+ )
+ name_to_save0 = gr.Textbox(
+ label=i18n("保存的模型名不带后缀"),
+ value="",
+ max_lines=1,
+ interactive=True,
+ )
+ version_2 = gr.Radio(
+ label=i18n("模型版本型号"),
+ choices=["v1", "v2"],
+ value="v1",
+ interactive=True,
+ )
+ with gr.Row():
+ but6 = gr.Button(i18n("融合"), variant="primary")
+ info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+ but6.click(
+ merge,
+ [
+ ckpt_a,
+ ckpt_b,
+ alpha_a,
+ sr_,
+ if_f0_,
+ info__,
+ name_to_save0,
+ version_2,
+ ],
+ info4,
+ api_name="ckpt_merge",
+ ) # def merge(path1,path2,alpha1,sr,f0,info):
+ with gr.Group():
+ gr.Markdown(
+ value=i18n("修改模型信息(仅支持weights文件夹下提取的小模型文件)")
+ )
+ with gr.Row():
+ ckpt_path0 = gr.Textbox(
+ label=i18n("模型路径"), value="", interactive=True
+ )
+ info_ = gr.Textbox(
+ label=i18n("要改的模型信息"),
+ value="",
+ max_lines=8,
+ interactive=True,
+ )
+ name_to_save1 = gr.Textbox(
+ label=i18n("保存的文件名, 默认空为和源文件同名"),
+ value="",
+ max_lines=8,
+ interactive=True,
+ )
+ with gr.Row():
+ but7 = gr.Button(i18n("修改"), variant="primary")
+ info5 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+ but7.click(
+ change_info,
+ [ckpt_path0, info_, name_to_save1],
+ info5,
+ api_name="ckpt_modify",
+ )
+ with gr.Group():
+ gr.Markdown(
+ value=i18n("查看模型信息(仅支持weights文件夹下提取的小模型文件)")
+ )
+ with gr.Row():
+ ckpt_path1 = gr.Textbox(
+ label=i18n("模型路径"), value="", interactive=True
+ )
+ but8 = gr.Button(i18n("查看"), variant="primary")
+ info6 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+ but8.click(show_info, [ckpt_path1], info6, api_name="ckpt_show")
+ with gr.Group():
+ gr.Markdown(
+ value=i18n(
+ "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况"
+ )
+ )
+ with gr.Row():
+ ckpt_path2 = gr.Textbox(
+ label=i18n("模型路径"),
+ value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth",
+ interactive=True,
+ )
+ save_name = gr.Textbox(
+ label=i18n("保存名"), value="", interactive=True
+ )
+ sr__ = gr.Radio(
+ label=i18n("目标采样率"),
+ choices=["32k", "40k", "48k"],
+ value="40k",
+ interactive=True,
+ )
+ if_f0__ = gr.Radio(
+ label=i18n("模型是否带音高指导,1是0否"),
+ choices=["1", "0"],
+ value="1",
+ interactive=True,
+ )
+ version_1 = gr.Radio(
+ label=i18n("模型版本型号"),
+ choices=["v1", "v2"],
+ value="v2",
+ interactive=True,
+ )
+ info___ = gr.Textbox(
+ label=i18n("要置入的模型信息"),
+ value="",
+ max_lines=8,
+ interactive=True,
+ )
+ but9 = gr.Button(i18n("提取"), variant="primary")
+ info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+ ckpt_path2.change(
+ change_info_, [ckpt_path2], [sr__, if_f0__, version_1]
+ )
+ but9.click(
+ extract_small_model,
+ [ckpt_path2, save_name, sr__, if_f0__, info___, version_1],
+ info7,
+ api_name="ckpt_extract",
+ )
+
+ with gr.TabItem(i18n("Onnx导出")):
+ with gr.Row():
+ ckpt_dir = gr.Textbox(
+ label=i18n("RVC模型路径"), value="", interactive=True
+ )
+ with gr.Row():
+ onnx_dir = gr.Textbox(
+ label=i18n("Onnx输出路径"), value="", interactive=True
+ )
+ with gr.Row():
+ infoOnnx = gr.Label(label="info")
+ with gr.Row():
+ butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary")
+ butOnnx.click(
+ export_onnx, [ckpt_dir, onnx_dir], infoOnnx, api_name="export_onnx"
+ )
+
+ tab_faq = i18n("常见问题解答")
+ with gr.TabItem(tab_faq):
+ try:
+ if tab_faq == "常见问题解答":
+ with open("docs/cn/faq.md", "r", encoding="utf8") as f:
+ info = f.read()
+ else:
+ with open("docs/en/faq_en.md", "r", encoding="utf8") as f:
+ info = f.read()
+ gr.Markdown(value=info)
+ except:
+ gr.Markdown(traceback.format_exc())
+
+ if config.iscolab:
+ app.queue(concurrency_count=511, max_size=1022).launch(share=True)
+ else:
+ app.queue(concurrency_count=511, max_size=1022).launch(
+ server_name="0.0.0.0",
+ inbrowser=not config.noautoopen,
+ server_port=config.listen_port,
+ quiet=True,
+ )
\ No newline at end of file
diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py
index 765c54c61..8184ca004 100644
--- a/infer/lib/train/utils.py
+++ b/infer/lib/train/utils.py
@@ -235,8 +235,9 @@ def plot_spectrogram_to_numpy(spectrogram):
plt.tight_layout()
fig.canvas.draw()
- data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
- data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ # Fix for newer matplotlib versions
+ buf = fig.canvas.buffer_rgba()
+ data = np.asarray(buf)[:, :, :3]
plt.close()
return data
@@ -266,8 +267,9 @@ def plot_alignment_to_numpy(alignment, info=None):
plt.tight_layout()
fig.canvas.draw()
- data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
- data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ # Fix for newer matplotlib versions
+ buf = fig.canvas.buffer_rgba()
+ data = np.asarray(buf)[:, :, :3]
plt.close()
return data
diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py
index 96a69dee4..1f9d725b9 100644
--- a/infer/modules/train/extract_feature_print.py
+++ b/infer/modules/train/extract_feature_print.py
@@ -86,6 +86,15 @@ def readwave(wav_path, normalize=False):
% model_path
)
exit(0)
+
+# Fix for PyTorch 2.6+ weights_only default change
+import torch.serialization
+try:
+ import fairseq.data.dictionary
+ torch.serialization.add_safe_globals([fairseq.data.dictionary.Dictionary])
+except:
+ pass
+
models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[model_path],
suffix="",
diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py
index 38a567828..aefdd0349 100644
--- a/infer/modules/train/train.py
+++ b/infer/modules/train/train.py
@@ -18,6 +18,22 @@
import torch
+# Device detection for MPS (Apple Silicon), CUDA, or CPU
+USE_MPS = False
+USE_CUDA = False
+DEVICE = "cpu"
+
+if torch.backends.mps.is_available():
+ USE_MPS = True
+ DEVICE = "mps"
+ print("Using Apple Silicon MPS GPU acceleration")
+elif torch.cuda.is_available():
+ USE_CUDA = True
+ DEVICE = "cuda"
+ print("Using NVIDIA CUDA GPU acceleration")
+else:
+ print("No GPU detected, using CPU")
+
try:
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
@@ -79,6 +95,15 @@
global_step = 0
+def to_device(tensor, rank=0):
+ """Move tensor to appropriate device (MPS, CUDA, or CPU)"""
+ if USE_MPS:
+ return tensor.to("mps")
+ elif USE_CUDA:
+ return tensor.cuda(rank, non_blocking=True)
+ return tensor
+
+
class EpochRecorder:
def __init__(self):
self.last_time = ttime()
@@ -167,7 +192,7 @@ def run(rank, n_gpus, hps, logger: logging.Logger):
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
- is_half=hps.train.fp16_run,
+ is_half=hps.train.fp16_run and not USE_MPS, # MPS doesn't support fp16 well
sr=hps.sample_rate,
)
else:
@@ -175,12 +200,17 @@ def run(rank, n_gpus, hps, logger: logging.Logger):
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
- is_half=hps.train.fp16_run,
+ is_half=hps.train.fp16_run and not USE_MPS,
)
- if torch.cuda.is_available():
+ # Move models to device
+ if USE_MPS:
+ net_g = net_g.to("mps")
+ elif USE_CUDA:
net_g = net_g.cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
- if torch.cuda.is_available():
+ if USE_MPS:
+ net_d = net_d.to("mps")
+ elif USE_CUDA:
net_d = net_d.cuda(rank)
optim_g = torch.optim.AdamW(
net_g.parameters(),
@@ -194,16 +224,16 @@ def run(rank, n_gpus, hps, logger: logging.Logger):
betas=hps.train.betas,
eps=hps.train.eps,
)
- # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
- # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
+ # DDP wrapping (not used for MPS single-GPU)
if hasattr(torch, "xpu") and torch.xpu.is_available():
pass
- elif torch.cuda.is_available():
+ elif USE_CUDA:
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
- else:
+ elif not USE_MPS:
net_g = DDP(net_g)
net_d = DDP(net_d)
+ # MPS: no DDP needed for single GPU
try: # 如果能加载自动resume
_, _, _, epoch_str = utils.load_checkpoint(
@@ -260,7 +290,8 @@ def run(rank, n_gpus, hps, logger: logging.Logger):
optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
)
- scaler = GradScaler(enabled=hps.train.fp16_run)
+ # MPS doesn't support fp16 GradScaler well
+ scaler = GradScaler(enabled=hps.train.fp16_run and not USE_MPS)
cache = []
for epoch in range(epoch_str, hps.train.epochs + 1):
@@ -341,18 +372,18 @@ def train_and_evaluate(
wave_lengths,
sid,
) = info
- # Load on CUDA
- if torch.cuda.is_available():
- phone = phone.cuda(rank, non_blocking=True)
- phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
+ # Load on GPU (CUDA or MPS)
+ if USE_CUDA or USE_MPS:
+ phone = to_device(phone, rank)
+ phone_lengths = to_device(phone_lengths, rank)
if hps.if_f0 == 1:
- pitch = pitch.cuda(rank, non_blocking=True)
- pitchf = pitchf.cuda(rank, non_blocking=True)
- sid = sid.cuda(rank, non_blocking=True)
- spec = spec.cuda(rank, non_blocking=True)
- spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
- wave = wave.cuda(rank, non_blocking=True)
- wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
+ pitch = to_device(pitch, rank)
+ pitchf = to_device(pitchf, rank)
+ sid = to_device(sid, rank)
+ spec = to_device(spec, rank)
+ spec_lengths = to_device(spec_lengths, rank)
+ wave = to_device(wave, rank)
+ wave_lengths = to_device(wave_lengths, rank)
# Cache on list
if hps.if_f0 == 1:
cache.append(
@@ -412,21 +443,20 @@ def train_and_evaluate(
) = info
else:
phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
- ## Load on CUDA
- if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available():
- phone = phone.cuda(rank, non_blocking=True)
- phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
+ ## Load on GPU (CUDA or MPS)
+ if (hps.if_cache_data_in_gpu == False) and (USE_CUDA or USE_MPS):
+ phone = to_device(phone, rank)
+ phone_lengths = to_device(phone_lengths, rank)
if hps.if_f0 == 1:
- pitch = pitch.cuda(rank, non_blocking=True)
- pitchf = pitchf.cuda(rank, non_blocking=True)
- sid = sid.cuda(rank, non_blocking=True)
- spec = spec.cuda(rank, non_blocking=True)
- spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
- wave = wave.cuda(rank, non_blocking=True)
- # wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
+ pitch = to_device(pitch, rank)
+ pitchf = to_device(pitchf, rank)
+ sid = to_device(sid, rank)
+ spec = to_device(spec, rank)
+ spec_lengths = to_device(spec_lengths, rank)
+ wave = to_device(wave, rank)
# Calculate
- with autocast(enabled=hps.train.fp16_run):
+ with autocast(enabled=hps.train.fp16_run and not USE_MPS):
if hps.if_f0 == 1:
(
y_hat,
diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py
index c128707cf..1e4afde83 100644
--- a/infer/modules/vc/utils.py
+++ b/infer/modules/vc/utils.py
@@ -1,7 +1,17 @@
import os
+import torch
from fairseq import checkpoint_utils
+# PyTorch 2.6+ compatibility: weights_only=True by default breaks fairseq loading
+# Monkey-patch torch.load to use weights_only=False for model loading
+_original_torch_load = torch.load
+def _patched_torch_load(*args, **kwargs):
+ if 'weights_only' not in kwargs:
+ kwargs['weights_only'] = False
+ return _original_torch_load(*args, **kwargs)
+torch.load = _patched_torch_load
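+# Note: this patch applies to every torch.load call in this process, so only load checkpoints from trusted sources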
+
def get_index_path_from_model(sid):
return next(
diff --git a/inference_log.txt b/inference_log.txt
new file mode 100644
index 000000000..2b53e1c21
--- /dev/null
+++ b/inference_log.txt
@@ -0,0 +1,24 @@
+/opt/anaconda3/envs/rvc/lib/python3.10/site-packages/pyworld/__init__.py:13: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+ import pkg_resources
+2025-12-08 01:10:34 | INFO | configs.config | No supported Nvidia GPU found
+2025-12-08 01:10:34 | INFO | configs.config | MPS available but using CPU for stability
+2025-12-08 01:10:34 | INFO | configs.config | overwrite v1/32k.json
+2025-12-08 01:10:34 | INFO | configs.config | overwrite v1/40k.json
+2025-12-08 01:10:34 | INFO | configs.config | overwrite v1/48k.json
+2025-12-08 01:10:34 | INFO | configs.config | overwrite v2/48k.json
+2025-12-08 01:10:34 | INFO | configs.config | overwrite v2/32k.json
+2025-12-08 01:10:34 | INFO | configs.config | overwrite preprocess_per to 3
+2025-12-08 01:10:34 | INFO | configs.config | Use cpu instead
+2025-12-08 01:10:34 | INFO | configs.config | Half-precision floating-point: False, device: cpu
+2025-12-08 01:10:34 | INFO | infer.modules.vc.modules | Get sid: Voice_New.pth
+2025-12-08 01:10:34 | INFO | infer.modules.vc.modules | Loading: assets/weights/Voice_New.pth
+/opt/anaconda3/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
+ WeightNorm.apply(module, name, dim)
+2025-12-08 01:10:34 | INFO | infer.modules.vc.modules | Select index: logs/Voice_New/added_IVF86_Flat_nprobe_1.index
+2025-12-08 01:10:34 | INFO | fairseq.tasks.hubert_pretraining | current directory is /Users/arunkumarv/Music/Voice Clone/rvc-webui
+2025-12-08 01:10:34 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
+2025-12-08 01:10:34 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False}
+/opt/anaconda3/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
+ WeightNorm.apply(module, name, dim)
+/opt/anaconda3/envs/rvc/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
+ warnings.warn('resource_tracker: There appear to be %d '
diff --git a/run_inference.py b/run_inference.py
new file mode 100644
index 000000000..f77debf23
--- /dev/null
+++ b/run_inference.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+import os
+import sys
+
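+# Single-threaded BLAS/OpenMP plus KMP_DUPLICATE_LIB_OK helps avoid libomp conflicts that commonly crash faiss/torch on macOS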
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
+
+import torch
+import faiss
+faiss.omp_set_num_threads(1)
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from dotenv import load_dotenv
+load_dotenv()
+
+os.environ['weight_root'] = 'assets/weights'
+os.environ['index_root'] = 'logs'
+os.environ['rmvpe_root'] = 'assets/rmvpe'
+
+print(f"PyTorch version: {torch.__version__}")
+print(f"CUDA available: {torch.cuda.is_available()}")
+print(f"MPS available: {torch.backends.mps.is_available() if hasattr(torch.backends, 'mps') else False}")
+
+from configs.config import Config
+config = Config()
+print(f"Device selected by config: {config.device}")
+
+from infer.modules.vc.modules import VC
+vc_instance = VC(config)
+
+model_name = "Voice_New.pth"
+input_audio = "/Users/arunkumarv/Music/Voice Clone/Voice_convert.mp3"
+output_audio = "/Users/arunkumarv/Music/Voice Clone/rvc-webui/output/Voice_New/converted.wav"
+
+os.makedirs(os.path.dirname(output_audio), exist_ok=True)
+
+print(f"\nLoading model: {model_name}")
+vc_instance.get_vc(model_name)
+
+print(f"Converting audio: {input_audio}")
+print(f"Output will be saved to: {output_audio}")
+
+print("Starting vc_single...")
+sys.stdout.flush()
+
+import soundfile as sf
+
+try:
+ result_message, audio_result = vc_instance.vc_single(
+ sid=0,
+ input_audio_path=input_audio,
+ f0_up_key=0,
+ f0_file=None,
+ f0_method="rmvpe",
+ file_index=f"logs/Voice_New/added_IVF86_Flat_nprobe_1.index",
+ file_index2="",
+ index_rate=0.75,
+ filter_radius=3,
+ resample_sr=0,
+ rms_mix_rate=0.25,
+ protect=0.33
+ )
+
+ print(f"\nResult: {result_message}")
+
+ sample_rate, audio_data = audio_result
+ if audio_data is not None and sample_rate is not None:
+ sf.write(output_audio, audio_data, sample_rate)
+ print(f"✓ Audio saved successfully to: {output_audio}")
+ else:
+ print("✗ Conversion failed!")
+ sys.exit(1)
+except Exception as e:
+ import traceback
+ print(f"Error: {e}")
+ traceback.print_exc()
+ sys.exit(1)
diff --git a/run_inference_api.py b/run_inference_api.py
new file mode 100644
index 000000000..e1fcfc7ad
--- /dev/null
+++ b/run_inference_api.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+from gradio_client import Client
+import os
+import shutil
+
+client = Client("http://localhost:7865")
+
+input_audio = "/Users/arunkumarv/Music/Voice Clone/Voice_convert.mp3"
+output_dir = "/Users/arunkumarv/Music/Voice Clone/rvc-webui/output/Voice_New"
+os.makedirs(output_dir, exist_ok=True)
+
+print(f"Submitting inference request...")
+print(f"Input: {input_audio}")
+print(f"Model: Voice_New.pth")
+print(f"F0 Method: pm")
+
+result = client.predict(
+ spk_item="Voice_New.pth",
+ input_audio0=input_audio,
+ vc_transform0=0, # pitch shift
+ f0_file=None,
+ f0method0="pm", # F0 method
+ file_index1="", # manual index path
+ file_index2="logs/Voice_New/added_IVF86_Flat_nprobe_1.index", # dropdown selection
+ index_rate1=0.75, # retrieval mix
+ filter_radius0=3, # median filter
+ resample_sr0=0, # output sample rate
+ rms_mix_rate0=0.25, # volume envelope
+ protect0=0.33, # consonant protection
+ api_name="/infer_convert"
+)
+
+output_message, output_audio_tuple = result
+print(f"\nResult: {output_message}")
+
+if output_audio_tuple and len(output_audio_tuple) > 1:
+ output_path = os.path.join(output_dir, "converted.wav")
+ if isinstance(output_audio_tuple, tuple) and len(output_audio_tuple) == 2:
+ sr, audio_file = output_audio_tuple
+ shutil.copy(audio_file, output_path)  # gradio returns the converted audio as a temp file path
+ print(f"✓ Audio converted successfully! Sample rate: {sr} Hz")
+ print(f"Output: {output_path}")
+ else:
+ print(f"Unexpected output format: {output_audio_tuple}")
+else:
+ print("✗ Conversion failed!")
diff --git a/setup-doc.md b/setup-doc.md
new file mode 100644
index 000000000..153f17980
--- /dev/null
+++ b/setup-doc.md
@@ -0,0 +1,76 @@
+# RVC WebUI Setup Documentation
+
+## Environment
+
+- macOS (Apple Silicon)
+- Python 3.10 (Conda)
+- PyTorch with MPS support
+
+## Problems and Solutions
+
+### 1. fairseq Installation Failure
+
+**Error**: `omegaconf` metadata parsing error when installing fairseq with pip 24.1 or newer (the environment had pip 25.x)
+
+**Fix**:
+```bash
+pip install "pip<24.1"
+pip install fairseq==0.12.2
+```
+
+### 2. Gradio Version Mismatch
+
+**Error**: `concurrency_count` parameter not recognized (wrong gradio version installed)
+
+**Fix**:
+```bash
+pip install gradio==3.34.0
+```
+
+### 3. gradio_client.serializing Module Not Found
+
+**Error**: `ModuleNotFoundError: No module named 'gradio_client.serializing'`
+
+**Cause**: Newer gradio-client releases removed the `serializing` module that gradio 3.34.0 still imports
+
+**Fix**:
+```bash
+pip install gradio-client==0.2.7
+```
+
+## Working Installation Sequence
+
+```bash
+conda create -n rvc python=3.10 -y
+conda activate rvc
+
+pip install "pip<24.1"
+pip install torch torchvision torchaudio
+pip install -r requirements.txt
+pip install fairseq==0.12.2
+pip install gradio==3.34.0
+pip install gradio-client==0.2.7
+
+python tools/download_models.py
+python infer-web.py
+```
+
+## Verification
+
+```bash
+python -c "import torch; print(torch.backends.mps.is_available())"
+python -c "from gradio_client.serializing import Serializable; print('OK')"
+```
+
+## Access
+
+WebUI runs at: http://localhost:7865
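+
+A quick sketch to confirm the Gradio API is reachable from Python (assumes the WebUI is already running and gradio-client 0.2.7 is installed as above):
+
+```python
+from gradio_client import Client
+
+client = Client("http://localhost:7865")
+client.view_api()  # lists available endpoints such as /infer_convert
+```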
diff --git a/temp_download_dialects.py b/temp_download_dialects.py
new file mode 100644
index 000000000..c93b14491
--- /dev/null
+++ b/temp_download_dialects.py
@@ -0,0 +1,33 @@
+
+import os
+from datasets import load_dataset
+from pathlib import Path
+import soundfile as sf
+
+print("Loading English Dialects dataset...")
+ds = load_dataset("ylacombe/english_dialects", split="train", streaming=True)
+
+datasets_dir = Path("datasets/accent_non_native")
+datasets_dir.mkdir(parents=True, exist_ok=True)
+
+count = 0
+max_samples = 150
+
+# Target dialects for "non-native" feel (regional accents)
+target_dialects = ["scottish", "irish", "welsh", "northern"]
+
+for sample in ds:
+ dialect = sample.get("dialect", "").lower()
+
+ if any(d in dialect for d in target_dialects) and count < max_samples:
+ audio = sample["audio"]
+ out_path = datasets_dir / f"{dialect}_{count}.wav"
+
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+ count += 1
+ print(f"Saved {out_path.name} (total: {count})")
+
+ if count >= max_samples:
+ break
+
+print(f"Downloaded {count} dialect samples")
diff --git a/temp_download_genshin.py b/temp_download_genshin.py
new file mode 100644
index 000000000..27e2e38eb
--- /dev/null
+++ b/temp_download_genshin.py
@@ -0,0 +1,38 @@
+
+import os
+from datasets import load_dataset
+from pathlib import Path
+import soundfile as sf
+
+print("Loading Genshin Voice dataset...")
+ds = load_dataset("simon3000/genshin-voice", split="train", streaming=True)
+
+datasets_dir = Path("datasets/anime_airy")
+datasets_dir.mkdir(parents=True, exist_ok=True)
+
+count = 0
+max_samples = 150
+
+# Target characters with airy/cute voices (English)
+target_chars = ["paimon", "barbara", "kokomi", "nahida", "klee", "qiqi", "diona"]
+
+for sample in ds:
+ speaker = str(sample.get("speaker", "")).lower()
+ lang = sample.get("language", "")
+
+ # Only English samples
+ if lang != "en":
+ continue
+
+ if any(char in speaker for char in target_chars) and count < max_samples:
+ audio = sample["audio"]
+ out_path = datasets_dir / f"{speaker.replace(' ', '_')}_{count}.wav"
+
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+ count += 1
+ print(f"Saved {out_path.name} (total: {count})")
+
+ if count >= max_samples:
+ break
+
+print(f"Downloaded {count} anime voice samples")
diff --git a/temp_download_hifi.py b/temp_download_hifi.py
new file mode 100644
index 000000000..7724e5af6
--- /dev/null
+++ b/temp_download_hifi.py
@@ -0,0 +1,46 @@
+
+import os
+from datasets import load_dataset
+from pathlib import Path
+import soundfile as sf
+
+print("Loading Hi-Fi TTS dataset...")
+ds = load_dataset("MikhailT/hifi-tts", split="train", streaming=True)
+
+datasets_dir = Path("datasets")
+
+# Speaker ID to voice type mapping for HiFi-TTS
+# HiFi has 10 speakers total
+voice_map = {
+ "92": "male_low", # Deep male
+ "6097": "male_mid", # Mid male
+ "6670": "female_low", # Lower female
+ "6671": "female_high", # Higher female
+ "8051": "singing_male",
+ "9017": "singing_female",
+}
+
+counts = {k: 0 for k in set(voice_map.values())}
+max_per_type = 100
+
+for sample in ds:
+ speaker = str(sample.get("speaker", ""))
+
+ if speaker in voice_map:
+ voice_type = voice_map[speaker]
+
+ if counts[voice_type] < max_per_type:
+ out_dir = datasets_dir / voice_type
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ audio = sample["audio"]
+ out_path = out_dir / f"hifi_{speaker}_{counts[voice_type]}.wav"
+
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+ counts[voice_type] += 1
+ print(f"Saved {out_path.name} ({voice_type}: {counts[voice_type]})")
+
+ if all(c >= max_per_type for c in counts.values()):
+ break
+
+print(f"Final counts: {counts}")
diff --git a/temp_download_libritts.py b/temp_download_libritts.py
new file mode 100644
index 000000000..b8d7dd48f
--- /dev/null
+++ b/temp_download_libritts.py
@@ -0,0 +1,45 @@
+
+import os
+from datasets import load_dataset
+from pathlib import Path
+import soundfile as sf
+
+print("Loading LibriTTS dataset (this may take a while)...")
+ds = load_dataset("mythicinfinity/libritts", "clean", split="train.clean.100", streaming=True)
+
+# Sample speakers - take first 50 samples per target voice type
+# LibriTTS speaker IDs are in the 'speaker_id' column
+target_speakers = {
+ "male_low": ["19", "26", "1272"], # Deep male voices
+ "male_mid": ["32", "40", "1089"], # Mid-range male
+ "female_low": ["87", "103", "1284"], # Lower female
+ "female_high": ["121", "237", "3570"], # Higher female
+}
+
+datasets_dir = Path("datasets")
+counts = {k: 0 for k in target_speakers}
+max_per_type = 100 # Max samples per voice type
+
+for sample in ds:
+ speaker = str(sample.get("speaker_id", ""))
+
+ for voice_type, speakers in target_speakers.items():
+ if speaker in speakers and counts[voice_type] < max_per_type:
+ out_dir = datasets_dir / voice_type
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ audio = sample["audio"]
+ out_path = out_dir / f"{speaker}_{sample['id']}.wav"
+
+ # Save audio
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+
+ counts[voice_type] += 1
+ print(f"Saved {out_path.name} ({voice_type}: {counts[voice_type]})")
+
+ # Check if we have enough
+ if all(c >= max_per_type for c in counts.values()):
+ print("Collected enough samples!")
+ break
+
+print(f"Final counts: {counts}")
diff --git a/tools/audio_preprocessor.py b/tools/audio_preprocessor.py
new file mode 100644
index 000000000..7e564169a
--- /dev/null
+++ b/tools/audio_preprocessor.py
@@ -0,0 +1,84 @@
+import os
+import argparse
+import librosa
+import soundfile as sf
+import numpy as np
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+from tqdm import tqdm
+
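+# Example invocation (sketch; input/output paths are illustrative):
+#   python tools/audio_preprocessor.py -i raw_recordings/ -o datasets/male_low --sr 40000 --len 10000
+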
+def process_audio(input_path, output_dir, sr=40000, min_silence_len=500, silence_thresh=-40, chunk_len=10000):
+ """
+ Process audio file: convert to wav, normalize, remove silence, split into chunks
+ """
+ filename = os.path.splitext(os.path.basename(input_path))[0]
+
+ print(f"Processing {input_path}...")
+
+ # Load audio
+ try:
+ audio = AudioSegment.from_file(input_path)
+ except Exception as e:
+ print(f"Error loading {input_path}: {e}")
+ return
+
+ # Normalize
+ audio = audio.normalize()
+
+ # Split on silence
+ chunks = split_on_silence(
+ audio,
+ min_silence_len=min_silence_len,
+ silence_thresh=silence_thresh,
+ keep_silence=100
+ )
+
+ # Combine small chunks to reach target length
+ output_chunks = []
+ current_chunk = AudioSegment.empty()
+
+ for chunk in chunks:
+ if len(current_chunk) + len(chunk) < chunk_len:
+ current_chunk += chunk
+ else:
+ output_chunks.append(current_chunk)
+ current_chunk = chunk
+
+ if len(current_chunk) > 0:
+ output_chunks.append(current_chunk)
+
+ # Save chunks
+ os.makedirs(output_dir, exist_ok=True)
+
+ for i, chunk in enumerate(output_chunks):
+ # Convert to target sample rate
+ chunk = chunk.set_frame_rate(sr).set_channels(1)
+
+ # Export
+ out_name = f"{filename}_{i:03d}.wav"
+ out_path = os.path.join(output_dir, out_name)
+ chunk.export(out_path, format="wav")
+
+ print(f"Saved {len(output_chunks)} chunks to {output_dir}")
+
+def main():
+ parser = argparse.ArgumentParser(description="Audio Dataset Preprocessor for RVC")
+ parser.add_argument("--input", "-i", required=True, help="Input file or directory")
+ parser.add_argument("--output", "-o", required=True, help="Output directory")
+ parser.add_argument("--sr", type=int, default=40000, help="Target sample rate (default: 40000)")
+ parser.add_argument("--len", type=int, default=10000, help="Target chunk length in ms (default: 10000)")
+
+ args = parser.parse_args()
+
+ if os.path.isfile(args.input):
+ process_audio(args.input, args.output, sr=args.sr, chunk_len=args.len)
+ elif os.path.isdir(args.input):
+ files = [f for f in os.listdir(args.input) if f.lower().endswith(('.wav', '.mp3', '.flac', '.m4a', '.ogg'))]
+ for f in tqdm(files):
+ process_audio(os.path.join(args.input, f), args.output, sr=args.sr, chunk_len=args.len)
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/download_datasets.py b/tools/download_datasets.py
new file mode 100644
index 000000000..85407ea82
--- /dev/null
+++ b/tools/download_datasets.py
@@ -0,0 +1,246 @@
+"""
+Simplified dataset downloader - downloads voice data directly.
+"""
+import os
+import sys
+from pathlib import Path
+
+# Ensure we have datasets library
+try:
+ from datasets import load_dataset
+ import soundfile as sf
+except ImportError:
+ print("Installing required packages...")
+ os.system("pip install datasets soundfile")
+ from datasets import load_dataset
+ import soundfile as sf
+
+# Setup paths
+SCRIPT_DIR = Path(__file__).parent
+PROJECT_ROOT = SCRIPT_DIR.parent
+DATASETS_DIR = PROJECT_ROOT / "datasets"
+
+def download_libritts():
+ """Download LibriTTS samples for male/female voices."""
+ print("\n=== Downloading LibriTTS (Male/Female Voices) ===")
+
+ ds = load_dataset("mythicinfinity/libritts", "clean", split="train.clean.100", streaming=True)
+
+ # Speaker ID to voice type mapping
+ speaker_voice_map = {
+ "19": "male_low",
+ "26": "male_low",
+ "1272": "male_low",
+ "32": "male_mid",
+ "40": "male_mid",
+ "1089": "male_mid",
+ "87": "female_low",
+ "103": "female_low",
+ "1284": "female_low",
+ "121": "female_high",
+ "237": "female_high",
+ "3570": "female_high",
+ }
+
+ counts = {}
+ for vt in set(speaker_voice_map.values()):
+ counts[vt] = 0
+ (DATASETS_DIR / vt).mkdir(parents=True, exist_ok=True)
+
+ max_per_type = 100
+
+ for sample in ds:
+ speaker = str(sample.get("speaker_id", ""))
+
+ if speaker in speaker_voice_map:
+ voice_type = speaker_voice_map[speaker]
+
+ if counts[voice_type] < max_per_type:
+ out_dir = DATASETS_DIR / voice_type
+ audio = sample["audio"]
+ out_path = out_dir / f"libritts_{speaker}_{counts[voice_type]}.wav"
+
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+ counts[voice_type] += 1
+ print(f" {voice_type}: {counts[voice_type]}/{max_per_type}", end="\r")
+
+ if all(c >= max_per_type for c in counts.values()):
+ break
+
+ print(f"\nLibriTTS complete: {counts}")
+
+def download_dialects():
+ """Download English dialect samples."""
+ print("\n=== Downloading English Dialects (Accents) ===")
+
+ out_dir = DATASETS_DIR / "accent_non_native"
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ count = 0
+ max_samples = 150
+ configs = ["scottish_male", "scottish_female", "irish_male", "welsh_male", "welsh_female", "northern_male"]
+
+ for config in configs:
+ if count >= max_samples:
+ break
+ try:
+ ds = load_dataset("ylacombe/english_dialects", config, split="train", streaming=True)
+ for sample in ds:
+ if count >= max_samples:
+ break
+ audio = sample["audio"]
+ out_path = out_dir / f"dialect_{config}_{count}.wav"
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+ count += 1
+ print(f" accent_non_native: {count}/{max_samples}", end="\r")
+ except Exception as e:
+ print(f" Error with {config}: {e}")
+
+ print(f"\nDialects complete: {count} samples")
+
+def download_genshin():
+ """Download Genshin voices for anime style."""
+ print("\n=== Downloading Genshin Voices (Anime Style) ===")
+
+ ds = load_dataset("simon3000/genshin-voice", split="train", streaming=True)
+
+ out_dir = DATASETS_DIR / "anime_airy"
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ count = 0
+ max_samples = 150
+ target_chars = ["paimon", "barbara", "kokomi", "nahida", "klee", "qiqi", "diona"]
+
+ for sample in ds:
+ speaker = str(sample.get("speaker", "")).lower()
+ lang = sample.get("language", "")
+
+ if lang != "en":
+ continue
+
+ if any(char in speaker for char in target_chars) and count < max_samples:
+ audio = sample["audio"]
+ clean_speaker = speaker.replace(" ", "_").replace("/", "_")
+ out_path = out_dir / f"genshin_{clean_speaker}_{count}.wav"
+
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+ count += 1
+ print(f" anime_airy: {count}/{max_samples}", end="\r")
+
+ if count >= max_samples:
+ break
+
+ print(f"\nGenshin complete: {count} samples")
+
+def download_hifi():
+ """Download Hi-Fi TTS for singing voices."""
+ print("\n=== Downloading Hi-Fi TTS (Singing/High Quality) ===")
+
+ ds = load_dataset("MikhailT/hifi-tts", "clean", split="train", streaming=True)
+
+ # Map speakers to voice types
+ speaker_map = {
+ "92": "singing_male",
+ "6097": "singing_male",
+ "6670": "singing_female",
+ "6671": "singing_female",
+ }
+
+ counts = {}
+ for vt in set(speaker_map.values()):
+ counts[vt] = 0
+ (DATASETS_DIR / vt).mkdir(parents=True, exist_ok=True)
+
+ max_per_type = 100
+
+ for sample in ds:
+ speaker = str(sample.get("speaker", ""))
+
+ if speaker in speaker_map:
+ voice_type = speaker_map[speaker]
+
+ if counts[voice_type] < max_per_type:
+ out_dir = DATASETS_DIR / voice_type
+ audio = sample["audio"]
+ out_path = out_dir / f"hifi_{speaker}_{counts[voice_type]}.wav"
+
+ sf.write(str(out_path), audio["array"], audio["sampling_rate"])
+ counts[voice_type] += 1
+ print(f" {voice_type}: {counts[voice_type]}/{max_per_type}", end="\r")
+
+ if all(c >= max_per_type for c in counts.values()):
+ break
+
+ print(f"\nHi-Fi TTS complete: {counts}")
+
+def print_summary():
+ """Print download summary."""
+ print("\n" + "=" * 50)
+ print("DOWNLOAD SUMMARY")
+ print("=" * 50)
+
+ voice_types = [
+ "male_low", "male_mid", "female_low", "female_high",
+ "anime_airy", "accent_non_native", "singing_male", "singing_female",
+ "child", "elderly"
+ ]
+
+ total = 0
+ for vt in voice_types:
+ vt_dir = DATASETS_DIR / vt
+ if vt_dir.exists():
+ files = list(vt_dir.glob("*.wav"))
+ count = len(files)
+ total += count
+ status = "✓" if count > 0 else "✗"
+ print(f" {status} {vt}: {count} files")
+ else:
+ print(f" ✗ {vt}: 0 files")
+
+ print(f"\nTotal: {total} audio files downloaded")
+ print("\nNote: 'child' and 'elderly' need manual data - not available in these datasets.")
+
+def main():
+ print("=" * 50)
+ print("RVC Voice Dataset Downloader")
+ print("=" * 50)
+ print(f"Output: {DATASETS_DIR}")
+
+ # Create all directories
+ for vt in ["male_low", "male_mid", "female_low", "female_high",
+ "anime_airy", "accent_non_native", "singing_male", "singing_female",
+ "child", "elderly"]:
+ (DATASETS_DIR / vt).mkdir(parents=True, exist_ok=True)
+
+ # Download each dataset
+ try:
+ download_libritts()
+ except Exception as e:
+ print(f"Error downloading LibriTTS: {e}")
+
+ try:
+ download_dialects()
+ except Exception as e:
+ print(f"Error downloading Dialects: {e}")
+
+ try:
+ download_genshin()
+ except Exception as e:
+ print(f"Error downloading Genshin: {e}")
+
+ try:
+ download_hifi()
+ except Exception as e:
+ print(f"Error downloading Hi-Fi TTS: {e}")
+
+ print_summary()
+
+ print("\n" + "=" * 50)
+ print("NEXT STEPS")
+ print("=" * 50)
+ print("1. Review downloaded files in datasets/ folder")
+ print("2. Train models: python tools/train_batch.py --voice male_low")
+ print("3. Run experiments: python tools/run_experiments_batch.py")
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/run_experiments_batch.py b/tools/run_experiments_batch.py
new file mode 100644
index 000000000..b1cd65b26
--- /dev/null
+++ b/tools/run_experiments_batch.py
@@ -0,0 +1,93 @@
+import os
+import subprocess
+import sys
+
+# Add root to path
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+def main():
+ # List of voices as defined in Task 7/10
+ voices = [
+ 'male_low', 'male_mid', 'female_low', 'female_high',
+ 'anime_airy', 'accent_non_native', 'singing_male', 'singing_female',
+ 'child', 'elderly'
+ ]
+
+ # Base paths
+ weights_dir = os.path.join(now_dir, "assets", "weights")
+ datasets_dir = os.path.join(now_dir, "datasets")
+ experiments_dir = os.path.join(now_dir, "experiments")
+
+ # Path to test_grid.py
+ test_grid_script = os.path.join(now_dir, "tools", "test_grid.py")
+
+ print(f"Starting Batch Experiments for {len(voices)} voices...")
+
+ for voice in voices:
+ print(f"\n--- Processing Voice: {voice} ---")
+
+ # Check for model
+ model_name = f"{voice}.pth"
+ model_path = os.path.join(weights_dir, model_name)
+
+ if not os.path.exists(model_path):
+ print(f"Skipping {voice}: Model not found at {model_path}")
+ continue
+
+ # Check for test audio
+ # A reference audio clip is needed to run inference on. Ideally this would come
+ # from a held-out 'test_samples' set; for now, look for 'test.wav' in the voice's
+ # dataset folder and otherwise fall back to the first .wav file found there.
+ # (The fallback means inference may run on one of the model's own training clips.)
+
+ voice_dataset_dir = os.path.join(datasets_dir, voice)
+ input_audio = None
+
+ if os.path.exists(voice_dataset_dir):
+ potential_files = [f for f in os.listdir(voice_dataset_dir) if f.endswith(".wav")]
+ if "test.wav" in potential_files:
+ input_audio = os.path.join(voice_dataset_dir, "test.wav")
+ elif len(potential_files) > 0:
+ input_audio = os.path.join(voice_dataset_dir, potential_files[0])
+
+ if not input_audio:
+ print(f"Skipping {voice}: No input audio found in {voice_dataset_dir}")
+ continue
+
+ # Check for index file (optional but recommended)
+ # Usually located in logs/{voice}/added_*.index
+ # We need to find it.
+ logs_dir = os.path.join(now_dir, "logs", voice)
+ index_path = ""
+ if os.path.exists(logs_dir):
+ for f in os.listdir(logs_dir):
+ if f.startswith("added_") and f.endswith(".index"):
+ index_path = os.path.join(logs_dir, f)
+ break
+
+ print(f"Model: {model_name}")
+ print(f"Input: {input_audio}")
+ print(f"Index: {index_path if index_path else 'None'}")
+
+ # Run test_grid.py
+ cmd = [
+ sys.executable, test_grid_script,
+ "--model_name", model_name,
+ "--input_path", input_audio,
+ "--output_dir", experiments_dir
+ ]
+
+ if index_path:
+ cmd.extend(["--index_path", index_path])
+
+ try:
+ subprocess.run(cmd, check=True)
+ print(f"Successfully ran experiments for {voice}")
+ except subprocess.CalledProcessError as e:
+ print(f"Error running experiments for {voice}: {e}")
+
+ print("\nBatch Experiments Completed.")
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/test_grid.py b/tools/test_grid.py
new file mode 100644
index 000000000..7f8728289
--- /dev/null
+++ b/tools/test_grid.py
@@ -0,0 +1,131 @@
+import itertools
+import argparse
+import os
+import sys
+import json
+import time
+from scipy.io import wavfile
+
+# Add root to path
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from dotenv import load_dotenv
+from configs.config import Config
+from infer.modules.vc.modules import VC
+
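+# Example invocation (sketch; the model and index file names are illustrative):
+#   python tools/test_grid.py --model_name male_low.pth --input_path datasets/male_low/test.wav \
+#       --index_path logs/male_low/added_IVF256_Flat_nprobe_1_male_low_v2.index
+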
+def main():
+ parser = argparse.ArgumentParser(description="Run RVC inference across a parameter grid.")
+ parser.add_argument("--model_name", required=True, help="Name of the model (must be in assets/weights)")
+ parser.add_argument("--input_path", required=True, help="Path to reference audio file")
+ parser.add_argument("--index_path", default="", help="Path to .index file")
+ parser.add_argument("--output_dir", default="experiments", help="Base directory for output")
+ parser.add_argument("--f0up_key", type=int, default=0, help="Pitch shift (semitones)")
+
+ args = parser.parse_args()
+
+ # Load config and VC
+ load_dotenv()
+ config = Config()
+ vc = VC(config)
+ vc.get_vc(args.model_name)
+
+ # Define Grid
+ # You can modify this grid in the code or make it configurable via JSON later
+ grid = {
+ "f0method": ["rmvpe", "pm"], # "harvest", "crepe" are slower
+ "index_rate": [0.0, 0.5, 0.75, 1.0],
+ "filter_radius": [3],
+ "rms_mix_rate": [0.25, 1.0],
+ "protect": [0.33],
+ "resample_sr": [0], # 0 means no resampling
+ }
+
+ # Prepare output directory
+ model_slug = os.path.splitext(args.model_name)[0]
+ audio_slug = os.path.splitext(os.path.basename(args.input_path))[0]
+ timestamp = int(time.time())
+ experiment_dir = os.path.join(args.output_dir, model_slug, audio_slug, str(timestamp))
+ os.makedirs(experiment_dir, exist_ok=True)
+
+ print(f"Starting Grid Search Experiment")
+ print(f"Model: {args.model_name}")
+ print(f"Input: {args.input_path}")
+ print(f"Output: {experiment_dir}")
+
+ # Generate combinations
+ keys = grid.keys()
+ values = grid.values()
+ combinations = list(itertools.product(*values))
+
+ results = []
+
+ total = len(combinations)
+ print(f"Total combinations to run: {total}")
+
+ for i, combo in enumerate(combinations):
+ params = dict(zip(keys, combo))
+ print(f"[{i+1}/{total}] Running with {params}")
+
+ # Construct output filename
+ # e.g. rmvpe_idx0.5_rms1.0.wav
+ filename_parts = [f"{k}{v}" for k, v in params.items()]
+ filename = "_".join(filename_parts) + ".wav"
+ output_path = os.path.join(experiment_dir, filename)
+
+ # Run Inference
+ try:
+ info, opt = vc.vc_single(
+ 0, # sid
+ args.input_path,
+ args.f0up_key,
+ None, # f0_file
+ params["f0method"],
+ args.index_path,
+ None, # file_index2
+ params["index_rate"],
+ params["filter_radius"],
+ params["resample_sr"],
+ params["rms_mix_rate"],
+ params["protect"]
+ )
+
+ if "Success" in info:
+ tgt_sr, audio_opt = opt
+ wavfile.write(output_path, tgt_sr, audio_opt)
+ results.append({
+ "params": params,
+ "output_file": filename,
+ "status": "success"
+ })
+ else:
+ print(f"Error: {info}")
+ results.append({
+ "params": params,
+ "status": "failed",
+ "error": info
+ })
+
+ except Exception as e:
+ print(f"Exception: {e}")
+ results.append({
+ "params": params,
+ "status": "error",
+ "error": str(e)
+ })
+
+ # Save metadata
+ metadata_path = os.path.join(experiment_dir, "metadata.json")
+ with open(metadata_path, "w") as f:
+ json.dump({
+ "model": args.model_name,
+ "input_path": args.input_path,
+ "f0up_key": args.f0up_key,
+ "grid": grid,
+ "results": results
+ }, f, indent=2)
+
+ print(f"Experiment completed. Results saved to {experiment_dir}")
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/train_batch.py b/tools/train_batch.py
new file mode 100644
index 000000000..90604efb2
--- /dev/null
+++ b/tools/train_batch.py
@@ -0,0 +1,197 @@
+import os
+import sys
+import time
+import json
+import argparse
+import subprocess
+from pathlib import Path
+
+# Add project root to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from configs.config import Config
+
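+# Example invocation (sketch): train every dataset folder, or just one voice:
+#   python tools/train_batch.py --voice male_low --epochs 50 --batch_size 8
+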
+def run_command(cmd, cwd=None):
+ print(f"Running: {cmd}")
+ process = subprocess.Popen(
+ cmd,
+ shell=True,
+ cwd=cwd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,  # merge stderr into stdout so the read loop below cannot deadlock
+ universal_newlines=True
+ )
+
+ # Stream output
+ while True:
+ output = process.stdout.readline()
+ if output == '' and process.poll() is not None:
+ break
+ if output:
+ print(output.strip())
+
+ rc = process.poll()
+ return rc
+
+def train_voice_model(voice_name, dataset_path, epochs=50, batch_size=8, sample_rate="40k", version="v2", gpu_id="0"):
+ """
+ Automates the RVC training pipeline for a single voice model.
+ """
+ print(f"\n{'='*50}")
+ print(f"Starting training for: {voice_name}")
+ print(f"{'='*50}\n")
+
+ root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ logs_dir = os.path.join(root_dir, "logs", voice_name)
+
+ # 1. Preprocessing
+ print("\n[Step 1/4] Preprocessing Data...")
+ cmd_preprocess = f"python infer/modules/train/preprocess.py \"{dataset_path}\" {sample_rate.replace('k','000')} 2 \"{logs_dir}\" False 3.0"
+ if run_command(cmd_preprocess, cwd=root_dir) != 0:
+ print("Error in preprocessing")
+ return False
+
+ # 2. Feature Extraction
+ print("\n[Step 2/4] Extracting Features...")
+ # F0 extraction (rmvpe_gpu)
+ cmd_f0 = f"python infer/modules/train/extract/extract_f0_rmvpe.py 1 0 0 \"{logs_dir}\" True"
+ if run_command(cmd_f0, cwd=root_dir) != 0:
+ print("Error in F0 extraction")
+ return False
+
+ # Feature extraction (HuBERT)
+ cmd_feat = f"python infer/modules/train/extract_feature_print.py {gpu_id} 1 0 0 \"{logs_dir}\" {version} False"
+ if run_command(cmd_feat, cwd=root_dir) != 0:
+ print("Error in feature extraction")
+ return False
+
+ # 3. Training Model
+ print("\n[Step 3/4] Training Model...")
+ # Determine pretrained models
+ if version == "v1":
+ pg = f"assets/pretrained/f0G{sample_rate}.pth"
+ pd = f"assets/pretrained/f0D{sample_rate}.pth"
+ else:
+ pg = f"assets/pretrained_v2/f0G{sample_rate}.pth"
+ pd = f"assets/pretrained_v2/f0D{sample_rate}.pth"
+
+ cmd_train = (
+ f"python infer/modules/train/train.py -e \"{voice_name}\" -sr {sample_rate} -f0 1 -bs {batch_size} "
+ f"-g {gpu_id} -te {epochs} -se 10 -pg \"{pg}\" -pd \"{pd}\" -l 0 -c 0 -sw 1 -v {version}"
+ )
+
+ if run_command(cmd_train, cwd=root_dir) != 0:
+ print("Error in training")
+ return False
+
+ # 4. Training Index
+ print("\n[Step 4/4] Training Index...")
+
+ index_script = f"""
+import sys
+import os
+import numpy as np
+import faiss
+from sklearn.cluster import MiniBatchKMeans
+
+exp_dir = "{logs_dir}"
+version = "{version}"
+feature_dir = os.path.join(exp_dir, "3_feature256" if version == "v1" else "3_feature768")
+
+if not os.path.exists(feature_dir):
+ print("Feature dir not found")
+ sys.exit(1)
+
+listdir_res = list(os.listdir(feature_dir))
+if len(listdir_res) == 0:
+ print("No features found")
+ sys.exit(1)
+
+npys = []
+for name in sorted(listdir_res):
+ phone = np.load(os.path.join(feature_dir, name))
+ npys.append(phone)
+
+big_npy = np.concatenate(npys, 0)
+big_npy_idx = np.arange(big_npy.shape[0])
+np.random.shuffle(big_npy_idx)
+big_npy = big_npy[big_npy_idx]
+
+if big_npy.shape[0] > 2e5:
+ big_npy = (
+ MiniBatchKMeans(
+ n_clusters=10000,
+ batch_size=256 * 8,
+ compute_labels=False,
+ init="random",
+ )
+ .fit(big_npy)
+ .cluster_centers_
+ )
+
+np.save(os.path.join(exp_dir, "total_fea.npy"), big_npy)
+n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+index = faiss.index_factory(256 if version == "v1" else 768, "IVF%s,Flat" % n_ivf)
+index_ivf = faiss.extract_index_ivf(index)
+index_ivf.nprobe = 1
+index.train(big_npy)
+faiss.write_index(
+ index,
+ os.path.join(exp_dir, f"trained_IVF{{n_ivf}}_Flat_nprobe_1_{voice_name}_{version}.index")
+)
+
+batch_size_add = 8192
+for i in range(0, big_npy.shape[0], batch_size_add):
+ index.add(big_npy[i : i + batch_size_add])
+
+faiss.write_index(
+ index,
+ os.path.join(exp_dir, f"added_IVF{{n_ivf}}_Flat_nprobe_1_{voice_name}_{version}.index")
+)
+print("Index training complete")
+"""
+
+ # Write the temp script into the project root, where run_command executes it
+ temp_script_path = os.path.join(root_dir, "temp_index_train.py")
+ with open(temp_script_path, "w") as f:
+ f.write(index_script)
+
+ if run_command("python temp_index_train.py", cwd=root_dir) != 0:
+ print("Error in index training")
+ os.remove(temp_script_path)
+ return False
+ os.remove(temp_script_path)
+ print(f"\nSuccessfully trained model for {voice_name}!")
+ return True
+
+def main():
+ parser = argparse.ArgumentParser(description="Batch Train RVC Models")
+ parser.add_argument("--voice", type=str, help="Specific voice name to train (folder name in datasets/)")
+ parser.add_argument("--epochs", type=int, default=50, help="Number of epochs")
+ parser.add_argument("--batch_size", type=int, default=8, help="Batch size")
+
+ args = parser.parse_args()
+
+ datasets_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "datasets")
+
+ if args.voice:
+ voices = [args.voice]
+ else:
+ voices = [d for d in os.listdir(datasets_dir) if os.path.isdir(os.path.join(datasets_dir, d))]
+
+ print(f"Found {len(voices)} voices to train: {voices}")
+
+ for voice in voices:
+ dataset_path = os.path.join(datasets_dir, voice)
+ # Check if dataset has files
+ if not os.path.exists(dataset_path) or not os.listdir(dataset_path):
+ print(f"Skipping {voice} - no data found")
+ continue
+
+ train_voice_model(voice, dataset_path, epochs=args.epochs, batch_size=args.batch_size)
+
+if __name__ == "__main__":
+ main()