diff --git a/Dockerfile.mac b/Dockerfile.mac new file mode 100644 index 000000000..596ae5679 --- /dev/null +++ b/Dockerfile.mac @@ -0,0 +1,37 @@ +FROM python:3.10-slim + +EXPOSE 7865 + +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y -qq ffmpeg aria2 git build-essential && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +COPY requirements.txt . + +RUN pip install --upgrade "pip<24.1" && \ + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \ + pip install --no-cache-dir -r requirements.txt && \ + pip install fairseq==0.12.2 && \ + pip install gradio==3.34.0 gradio-client==0.2.7 + +COPY . . + + +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d assets/pretrained_v2/ -o D40k.pth && \ + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d assets/pretrained_v2/ -o G40k.pth && \ + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d assets/pretrained_v2/ -o f0D40k.pth && \ + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d assets/pretrained_v2/ -o f0G40k.pth + +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d assets/hubert -o hubert_base.pt && \ + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d assets/rmvpe -o rmvpe.pt + +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth" -d assets/uvr5_weights/ -o "HP2-人声vocals+非人声instrumentals.pth" && \ + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth" -d assets/uvr5_weights/ -o "HP5-主旋律人声vocals+其他instrumentals.pth" + +VOLUME [ "/app/weights", "/app/logs", "/app/assets/weights" ] + +CMD ["python", "infer-web.py"] diff --git a/configs/config.py b/configs/config.py index a330fb543..47edad7f5 100644 --- a/configs/config.py +++ b/configs/config.py @@ -167,7 +167,8 @@ def device_config(self) -> tuple: self.preprocess_per = 3.0 elif self.has_mps(): logger.info("No supported Nvidia GPU found") - self.device = self.instead = "mps" + logger.info("MPS available but using CPU for stability") + self.device = self.instead = "cpu" self.is_half = False self.use_fp32_config() else: diff --git a/datasets/README.md b/datasets/README.md new file mode 100644 index 000000000..554aca12f --- /dev/null +++ b/datasets/README.md @@ -0,0 +1,36 @@ +# Voice Datasets + +This directory contains the audio datasets for training custom RVC models. 
+ +## Structure + +Each subdirectory corresponds to a specific voice type: + +- `male_low/`: Bass/Baritone male voices +- `male_mid/`: Tenor/Mid-range male voices +- `female_low/`: Alto/Contralto female voices +- `female_high/`: Soprano/High-range female voices +- `anime_airy/`: Breath/Airy anime-style voices +- `accent_non_native/`: Voices with distinct non-native accents +- `singing_male/`: Male singing vocals +- `singing_female/`: Female singing vocals +- `child/`: Child voices +- `elderly/`: Elderly voices + +## How to Add Data + +1. **Collect Audio**: Gather 10-15 minutes of clean, single-speaker audio for the desired category. +2. **Place Files**: Put the raw audio files (mp3, wav, etc.) into a temporary folder or directly here. +3. **Process**: Use the provided tool to normalize and split the audio. + +```bash +# Example: Processing a raw file into the male_low dataset +python tools/audio_preprocessor.py -i raw_audio/my_voice.mp3 -o datasets/male_low +``` + +## Requirements + +- **Format**: WAV (will be converted automatically) +- **Sample Rate**: 40kHz or 48kHz (will be converted automatically) +- **Channels**: Mono (will be converted automatically) +- **Quality**: No background noise, music, or reverb. Use UVR5 to clean if necessary. diff --git a/docker-compose.mac.yml b/docker-compose.mac.yml new file mode 100644 index 000000000..94472795d --- /dev/null +++ b/docker-compose.mac.yml @@ -0,0 +1,18 @@ +version: '3.8' + +services: + rvc-webui: + build: + context: . + dockerfile: Dockerfile.mac + ports: + - "7865:7865" + volumes: + - ./weights:/app/weights + - ./logs:/app/logs + - ./assets/weights:/app/assets/weights + - ./datasets:/app/datasets + environment: + - PYTHONUNBUFFERED=1 + restart: unless-stopped + platform: linux/amd64 diff --git a/experiments/output_test.wav b/experiments/output_test.wav new file mode 100644 index 000000000..468ab13b6 Binary files /dev/null and b/experiments/output_test.wav differ diff --git a/experiments/voice1_to_voice2.wav b/experiments/voice1_to_voice2.wav new file mode 100644 index 000000000..8095fa0fb Binary files /dev/null and b/experiments/voice1_to_voice2.wav differ diff --git a/infer-web.py b/infer-web.py index 47596d539..eade72a28 100644 --- a/infer-web.py +++ b/infer-web.py @@ -114,6 +114,11 @@ def forward_dml(ctx, x, scale): if if_gpu_ok and len(gpu_infos) > 0: gpu_info = "\n".join(gpu_infos) default_batch_size = min(mem) // 2 +elif torch.backends.mps.is_available(): + if_gpu_ok = True + gpu_infos.append("0\tApple Silicon MPS") + gpu_info = "Apple Silicon MPS detected" + default_batch_size = 4 else: gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练") default_batch_size = 1 @@ -220,6 +225,14 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True) f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w") f.close() + + # Verify trainset_dir exists + if not os.path.exists(trainset_dir): + error_msg = f"Training folder does not exist: {trainset_dir}" + logger.error(error_msg) + yield error_msg + return + cmd = '"%s" infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" %s %.1f' % ( config.python_cmd, trainset_dir, @@ -231,8 +244,19 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): config.preprocess_per, ) logger.info("Execute: " + cmd) - # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir - p = Popen(cmd, shell=True) + print(f"Starting preprocessing: {cmd}") + # Use shell=False with proper argument list for better reliability + cmd_args = [ + config.python_cmd, + 
"infer/modules/train/preprocess.py", + trainset_dir, + str(sr), + str(n_p), + f"{now_dir}/logs/{exp_dir}", + str(config.noparallel), + str(config.preprocess_per), + ] + p = Popen(cmd_args, cwd=now_dir) # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 done = [False] threading.Thread( diff --git a/infer-web.pyi b/infer-web.pyi new file mode 100644 index 000000000..9b6cfe35f --- /dev/null +++ b/infer-web.pyi @@ -0,0 +1,1625 @@ +import os +import sys +from dotenv import load_dotenv + +now_dir = os.getcwd() +sys.path.append(now_dir) +load_dotenv() +from infer.modules.vc.modules import VC +from infer.modules.uvr5.modules import uvr +from infer.lib.train.process_ckpt import ( + change_info, + extract_small_model, + merge, + show_info, +) +from i18n.i18n import I18nAuto +from configs.config import Config +from sklearn.cluster import MiniBatchKMeans +import torch, platform +import numpy as np +import gradio as gr +import faiss +import fairseq +import pathlib +import json +from time import sleep +from subprocess import Popen +from random import shuffle +import warnings +import traceback +import threading +import shutil +import logging + + +logging.getLogger("numba").setLevel(logging.WARNING) +logging.getLogger("httpx").setLevel(logging.WARNING) + +logger = logging.getLogger(__name__) + +tmp = os.path.join(now_dir, "TEMP") +shutil.rmtree(tmp, ignore_errors=True) +shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True) +shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True) +os.makedirs(tmp, exist_ok=True) +os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True) +os.makedirs(os.path.join(now_dir, "assets/weights"), exist_ok=True) +os.environ["TEMP"] = tmp +warnings.filterwarnings("ignore") +torch.manual_seed(114514) + + +config = Config() +vc = VC(config) + + +if config.dml == True: + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml +i18n = I18nAuto() +logger.info(i18n) +# 判断是否有能用来训练和加速推理的N卡 +ngpu = torch.cuda.device_count() +gpu_infos = [] +mem = [] +if_gpu_ok = False + +if torch.cuda.is_available() or ngpu != 0: + for i in range(ngpu): + gpu_name = torch.cuda.get_device_name(i) + if any( + value in gpu_name.upper() + for value in [ + "10", + "16", + "20", + "30", + "40", + "A2", + "A3", + "A4", + "P4", + "A50", + "500", + "A60", + "70", + "80", + "90", + "M4", + "T4", + "TITAN", + "4060", + "L", + "6000", + ] + ): + # A10#A100#V100#A40#P40#M40#K80#A4500 + if_gpu_ok = True # 至少有一张能用的N卡 + gpu_infos.append("%s\t%s" % (i, gpu_name)) + mem.append( + int( + torch.cuda.get_device_properties(i).total_memory + / 1024 + / 1024 + / 1024 + + 0.4 + ) + ) +if if_gpu_ok and len(gpu_infos) > 0: + gpu_info = "\n".join(gpu_infos) + default_batch_size = min(mem) // 2 +else: + gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练") + default_batch_size = 1 +gpus = "-".join([i[0] for i in gpu_infos]) + +from gradio.events import Dependency + +class ToolButton(gr.Button, gr.components.FormComponent): + """Small button with single emoji as text, fits inside gradio forms""" + + def __init__(self, **kwargs): + super().__init__(variant="tool", **kwargs) + + def get_block_name(self): + return "button" + from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING + from gradio.blocks import Block + if TYPE_CHECKING: + from gradio.components import Timer + from gradio.components.base import Component + + +weight_root = os.getenv("weight_root") 
+weight_uvr5_root = os.getenv("weight_uvr5_root") +index_root = os.getenv("index_root") +outside_index_root = os.getenv("outside_index_root") + +names = [] +for name in os.listdir(weight_root): + if name.endswith(".pth"): + names.append(name) +index_paths = [] + + +def lookup_indices(index_root): + global index_paths + for root, dirs, files in os.walk(index_root, topdown=False): + for name in files: + if name.endswith(".index") and "trained" not in name: + index_paths.append("%s/%s" % (root, name)) + + +lookup_indices(index_root) +lookup_indices(outside_index_root) +uvr5_names = [] +for name in os.listdir(weight_uvr5_root): + if name.endswith(".pth") or "onnx" in name: + uvr5_names.append(name.replace(".pth", "")) + + +def change_choices(): + names = [] + for name in os.listdir(weight_root): + if name.endswith(".pth"): + names.append(name) + index_paths = [] + for root, dirs, files in os.walk(index_root, topdown=False): + for name in files: + if name.endswith(".index") and "trained" not in name: + index_paths.append("%s/%s" % (root, name)) + return {"choices": sorted(names), "__type__": "update"}, { + "choices": sorted(index_paths), + "__type__": "update", + } + + +def clean(): + return {"value": "", "__type__": "update"} + + +def export_onnx(ModelPath, ExportedPath): + from infer.modules.onnx.export import export_onnx as eo + + eo(ModelPath, ExportedPath) + + +sr_dict = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +def if_done(done, p): + while 1: + if p.poll() is None: + sleep(0.5) + else: + break + done[0] = True + + +def if_done_multi(done, ps): + while 1: + # poll==None代表进程未结束 + # 只要有一个进程未结束都不停 + flag = 1 + for p in ps: + if p.poll() is None: + flag = 0 + sleep(0.5) + break + if flag == 1: + break + done[0] = True + + +def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): + sr = sr_dict[sr] + os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True) + f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w") + f.close() + cmd = '"%s" infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" %s %.1f' % ( + config.python_cmd, + trainset_dir, + sr, + n_p, + now_dir, + exp_dir, + config.noparallel, + config.preprocess_per, + ) + logger.info("Execute: " + cmd) + # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir + p = Popen(cmd, shell=True) + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + done = [False] + threading.Thread( + target=if_done, + args=( + done, + p, + ), + ).start() + while 1: + with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f: + yield (f.read()) + sleep(1) + if done[0]: + break + with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f: + log = f.read() + logger.info(log) + yield log + + +# but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2]) +def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvpe): + gpus = gpus.split("-") + os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True) + f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w") + f.close() + if if_f0: + if f0method != "rmvpe_gpu": + cmd = ( + '"%s" infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s' + % ( + config.python_cmd, + now_dir, + exp_dir, + n_p, + f0method, + ) + ) + logger.info("Execute: " + cmd) + p = Popen( + cmd, shell=True, cwd=now_dir + ) # , stdin=PIPE, stdout=PIPE,stderr=PIPE + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + done = [False] + threading.Thread( + target=if_done, + args=( + done, + p, + ), + ).start() + 
else: + if gpus_rmvpe != "-": + gpus_rmvpe = gpus_rmvpe.split("-") + leng = len(gpus_rmvpe) + ps = [] + for idx, n_g in enumerate(gpus_rmvpe): + cmd = ( + '"%s" infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' + % ( + config.python_cmd, + leng, + idx, + n_g, + now_dir, + exp_dir, + config.is_half, + ) + ) + logger.info("Execute: " + cmd) + p = Popen( + cmd, shell=True, cwd=now_dir + ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir + ps.append(p) + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + done = [False] + threading.Thread( + target=if_done_multi, # + args=( + done, + ps, + ), + ).start() + else: + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" ' + % ( + now_dir, + exp_dir, + ) + ) + logger.info("Execute: " + cmd) + p = Popen( + cmd, shell=True, cwd=now_dir + ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir + p.wait() + done = [True] + while 1: + with open( + "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r" + ) as f: + yield (f.read()) + sleep(1) + if done[0]: + break + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: + log = f.read() + logger.info(log) + yield log + # 对不同part分别开多进程 + """ + n_part=int(sys.argv[1]) + i_part=int(sys.argv[2]) + i_gpu=sys.argv[3] + exp_dir=sys.argv[4] + os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu) + """ + leng = len(gpus) + ps = [] + for idx, n_g in enumerate(gpus): + cmd = ( + '"%s" infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s %s' + % ( + config.python_cmd, + config.device, + leng, + idx, + n_g, + now_dir, + exp_dir, + version19, + config.is_half, + ) + ) + logger.info("Execute: " + cmd) + p = Popen( + cmd, shell=True, cwd=now_dir + ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir + ps.append(p) + # 煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + done = [False] + threading.Thread( + target=if_done_multi, + args=( + done, + ps, + ), + ).start() + while 1: + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: + yield (f.read()) + sleep(1) + if done[0]: + break + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: + log = f.read() + logger.info(log) + yield log + + +def get_pretrained_models(path_str, f0_str, sr2): + if_pretrained_generator_exist = os.access( + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + ) + if_pretrained_discriminator_exist = os.access( + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + ) + if not if_pretrained_generator_exist: + logger.warning( + "assets/pretrained%s/%sG%s.pth not exist, will not use pretrained model", + path_str, + f0_str, + sr2, + ) + if not if_pretrained_discriminator_exist: + logger.warning( + "assets/pretrained%s/%sD%s.pth not exist, will not use pretrained model", + path_str, + f0_str, + sr2, + ) + return ( + ( + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + if if_pretrained_generator_exist + else "" + ), + ( + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + if if_pretrained_discriminator_exist + else "" + ), + ) + + +def change_sr2(sr2, if_f0_3, version19): + path_str = "" if version19 == "v1" else "_v2" + f0_str = "f0" if if_f0_3 else "" + return get_pretrained_models(path_str, f0_str, sr2) + + +def change_version19(sr2, if_f0_3, version19): + path_str = "" if version19 == "v1" else "_v2" + if sr2 == "32k" and version19 == "v1": + sr2 = "40k" + 
to_return_sr2 = ( + {"choices": ["40k", "48k"], "__type__": "update", "value": sr2} + if version19 == "v1" + else {"choices": ["40k", "48k", "32k"], "__type__": "update", "value": sr2} + ) + f0_str = "f0" if if_f0_3 else "" + return ( + *get_pretrained_models(path_str, f0_str, sr2), + to_return_sr2, + ) + + +def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15 + path_str = "" if version19 == "v1" else "_v2" + return ( + {"visible": if_f0_3, "__type__": "update"}, + {"visible": if_f0_3, "__type__": "update"}, + *get_pretrained_models(path_str, "f0" if if_f0_3 == True else "", sr2), + ) + + +# but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16]) +def click_train( + exp_dir1, + sr2, + if_f0_3, + spk_id5, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + if_save_every_weights18, + version19, +): + # 生成filelist + exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) + os.makedirs(exp_dir, exist_ok=True) + gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir) + feature_dir = ( + "%s/3_feature256" % (exp_dir) + if version19 == "v1" + else "%s/3_feature768" % (exp_dir) + ) + if if_f0_3: + f0_dir = "%s/2a_f0" % (exp_dir) + f0nsf_dir = "%s/2b-f0nsf" % (exp_dir) + names = ( + set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) + & set([name.split(".")[0] for name in os.listdir(feature_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) + ) + else: + names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( + [name.split(".")[0] for name in os.listdir(feature_dir)] + ) + opt = [] + for name in names: + if if_f0_3: + opt.append( + "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" + % ( + gt_wavs_dir.replace("\\", "\\\\"), + name, + feature_dir.replace("\\", "\\\\"), + name, + f0_dir.replace("\\", "\\\\"), + name, + f0nsf_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) + else: + opt.append( + "%s/%s.wav|%s/%s.npy|%s" + % ( + gt_wavs_dir.replace("\\", "\\\\"), + name, + feature_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) + fea_dim = 256 if version19 == "v1" else 768 + if if_f0_3: + for _ in range(2): + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" + % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5) + ) + else: + for _ in range(2): + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s" + % (now_dir, sr2, now_dir, fea_dim, spk_id5) + ) + shuffle(opt) + with open("%s/filelist.txt" % exp_dir, "w") as f: + f.write("\n".join(opt)) + logger.debug("Write filelist done") + # 生成config#无需生成config + # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0" + logger.info("Use gpus: %s", str(gpus16)) + if pretrained_G14 == "": + logger.info("No pretrained Generator") + if pretrained_D15 == "": + logger.info("No pretrained Discriminator") + if version19 == "v1" or sr2 == "40k": + config_path = "v1/%s.json" % sr2 + else: + config_path = "v2/%s.json" % sr2 + config_save_path = os.path.join(exp_dir, "config.json") + if not pathlib.Path(config_save_path).exists(): + with open(config_save_path, "w", encoding="utf-8") as f: + json.dump( + 
config.json_config[config_path], + f, + ensure_ascii=False, + indent=4, + sort_keys=True, + ) + f.write("\n") + if gpus16: + cmd = ( + '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + % ( + config.python_cmd, + exp_dir1, + sr2, + 1 if if_f0_3 else 0, + batch_size12, + gpus16, + total_epoch11, + save_epoch10, + "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", + "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", + 1 if if_save_latest13 == i18n("是") else 0, + 1 if if_cache_gpu17 == i18n("是") else 0, + 1 if if_save_every_weights18 == i18n("是") else 0, + version19, + ) + ) + else: + cmd = ( + '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + % ( + config.python_cmd, + exp_dir1, + sr2, + 1 if if_f0_3 else 0, + batch_size12, + total_epoch11, + save_epoch10, + "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", + "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", + 1 if if_save_latest13 == i18n("是") else 0, + 1 if if_cache_gpu17 == i18n("是") else 0, + 1 if if_save_every_weights18 == i18n("是") else 0, + version19, + ) + ) + logger.info("Execute: " + cmd) + p = Popen(cmd, shell=True, cwd=now_dir) + p.wait() + return "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log" + + +# but4.click(train_index, [exp_dir1], info3) +def train_index(exp_dir1, version19): + # exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) + exp_dir = "logs/%s" % (exp_dir1) + os.makedirs(exp_dir, exist_ok=True) + feature_dir = ( + "%s/3_feature256" % (exp_dir) + if version19 == "v1" + else "%s/3_feature768" % (exp_dir) + ) + if not os.path.exists(feature_dir): + return "请先进行特征提取!" + listdir_res = list(os.listdir(feature_dir)) + if len(listdir_res) == 0: + return "请先进行特征提取!" + infos = [] + npys = [] + for name in sorted(listdir_res): + phone = np.load("%s/%s" % (feature_dir, name)) + npys.append(phone) + big_npy = np.concatenate(npys, 0) + big_npy_idx = np.arange(big_npy.shape[0]) + np.random.shuffle(big_npy_idx) + big_npy = big_npy[big_npy_idx] + if big_npy.shape[0] > 2e5: + infos.append("Trying doing kmeans %s shape to 10k centers." 
% big_npy.shape[0]) + yield "\n".join(infos) + try: + big_npy = ( + MiniBatchKMeans( + n_clusters=10000, + verbose=True, + batch_size=256 * config.n_cpu, + compute_labels=False, + init="random", + ) + .fit(big_npy) + .cluster_centers_ + ) + except: + info = traceback.format_exc() + logger.info(info) + infos.append(info) + yield "\n".join(infos) + + np.save("%s/total_fea.npy" % exp_dir, big_npy) + n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) + infos.append("%s,%s" % (big_npy.shape, n_ivf)) + yield "\n".join(infos) + index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf) + # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf) + infos.append("training") + yield "\n".join(infos) + index_ivf = faiss.extract_index_ivf(index) # + index_ivf.nprobe = 1 + index.train(big_npy) + faiss.write_index( + index, + "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index" + % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), + ) + infos.append("adding") + yield "\n".join(infos) + batch_size_add = 8192 + for i in range(0, big_npy.shape[0], batch_size_add): + index.add(big_npy[i : i + batch_size_add]) + faiss.write_index( + index, + "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index" + % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), + ) + infos.append( + "成功构建索引 added_IVF%s_Flat_nprobe_%s_%s_%s.index" + % (n_ivf, index_ivf.nprobe, exp_dir1, version19) + ) + try: + link = os.link if platform.system() == "Windows" else os.symlink + link( + "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index" + % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), + "%s/%s_IVF%s_Flat_nprobe_%s_%s_%s.index" + % ( + outside_index_root, + exp_dir1, + n_ivf, + index_ivf.nprobe, + exp_dir1, + version19, + ), + ) + infos.append("链接索引到外部-%s" % (outside_index_root)) + except: + infos.append("链接索引到外部-%s失败" % (outside_index_root)) + + # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19)) + # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19)) + yield "\n".join(infos) + + +# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3) +def train1key( + exp_dir1, + sr2, + if_f0_3, + trainset_dir4, + spk_id5, + np7, + f0method8, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + if_save_every_weights18, + version19, + gpus_rmvpe, +): + infos = [] + + def get_info_str(strr): + infos.append(strr) + return "\n".join(infos) + + # step1:处理数据 + yield get_info_str(i18n("step1:正在处理数据")) + [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)] + + # step2a:提取音高 + yield get_info_str(i18n("step2:正在提取音高&正在提取特征")) + [ + get_info_str(_) + for _ in extract_f0_feature( + gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe + ) + ] + + # step3a:训练模型 + yield get_info_str(i18n("step3a:正在训练模型")) + click_train( + exp_dir1, + sr2, + if_f0_3, + spk_id5, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + if_save_every_weights18, + version19, + ) + yield get_info_str( + i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log") + ) + + # step3b:训练索引 + [get_info_str(_) for _ in train_index(exp_dir1, version19)] + yield get_info_str(i18n("全流程结束!")) + + +# 
ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__]) +def change_info_(ckpt_path): + if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")): + return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + try: + with open( + ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r" + ) as f: + info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1]) + sr, f0 = info["sample_rate"], info["if_f0"] + version = "v2" if ("version" in info and info["version"] == "v2") else "v1" + return sr, str(f0), version + except: + traceback.print_exc() + return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + + +F0GPUVisible = config.dml == False + + +def change_f0_method(f0method8): + if f0method8 == "rmvpe_gpu": + visible = F0GPUVisible + else: + visible = False + return {"visible": visible, "__type__": "update"} + + +with gr.Blocks(title="RVC WebUI") as app: + gr.Markdown("## RVC WebUI") + gr.Markdown( + value=i18n( + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." + ) + ) + with gr.Tabs(): + with gr.TabItem(i18n("模型推理")): + with gr.Row(): + sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) + with gr.Column(): + refresh_button = gr.Button( + i18n("刷新音色列表和索引路径"), variant="primary" + ) + clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary") + spk_item = gr.Slider( + minimum=0, + maximum=2333, + step=1, + label=i18n("请选择说话人id"), + value=0, + visible=False, + interactive=True, + ) + clean_button.click( + fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean" + ) + with gr.TabItem(i18n("单次推理")): + with gr.Group(): + with gr.Row(): + with gr.Column(): + vc_transform0 = gr.Number( + label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), + value=0, + ) + input_audio0 = gr.Textbox( + label=i18n( + "输入待处理音频文件路径(默认是正确格式示例)" + ), + placeholder="C:\\Users\\Desktop\\audio_example.wav", + ) + file_index1 = gr.Textbox( + label=i18n( + "特征检索库文件路径,为空则使用下拉的选择结果" + ), + placeholder="C:\\Users\\Desktop\\model_example.index", + interactive=True, + ) + file_index2 = gr.Dropdown( + label=i18n("自动检测index路径,下拉式选择(dropdown)"), + choices=sorted(index_paths), + interactive=True, + ) + f0method0 = gr.Radio( + label=i18n( + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU" + ), + choices=( + ["pm", "harvest", "crepe", "rmvpe"] + if config.dml == False + else ["pm", "harvest", "rmvpe"] + ), + value="rmvpe", + interactive=True, + ) + + with gr.Column(): + resample_sr0 = gr.Slider( + minimum=0, + maximum=48000, + label=i18n("后处理重采样至最终采样率,0为不进行重采样"), + value=0, + step=1, + interactive=True, + ) + rms_mix_rate0 = gr.Slider( + minimum=0, + maximum=1, + label=i18n( + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络" + ), + value=0.25, + interactive=True, + ) + protect0 = gr.Slider( + minimum=0, + maximum=0.5, + label=i18n( + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果" + ), + value=0.33, + step=0.01, + interactive=True, + ) + filter_radius0 = gr.Slider( + minimum=0, + maximum=7, + label=i18n( + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音" + ), + value=3, + step=1, + interactive=True, + ) + index_rate1 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("检索特征占比"), + value=0.75, + interactive=True, + ) + f0_file = gr.File( + label=i18n( + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调" + ), + visible=False, + ) + + refresh_button.click( + fn=change_choices, + inputs=[], + outputs=[sid0, file_index2], + api_name="infer_refresh", + ) + # file_big_npy1 = gr.Textbox( + # label=i18n("特征文件路径"), + # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", + # interactive=True, + # ) + with gr.Group(): + with gr.Column(): + but0 = gr.Button(i18n("转换"), variant="primary") + with gr.Row(): + vc_output1 = gr.Textbox(label=i18n("输出信息")) + vc_output2 = gr.Audio( + label=i18n("输出音频(右下角三个点,点了可以下载)") + ) + + but0.click( + vc.vc_single, + [ + spk_item, + input_audio0, + vc_transform0, + f0_file, + f0method0, + file_index1, + file_index2, + # file_big_npy1, + index_rate1, + filter_radius0, + resample_sr0, + rms_mix_rate0, + protect0, + ], + [vc_output1, vc_output2], + api_name="infer_convert", + ) + with gr.TabItem(i18n("批量推理")): + gr.Markdown( + value=i18n( + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. 
" + ) + ) + with gr.Row(): + with gr.Column(): + vc_transform1 = gr.Number( + label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), + value=0, + ) + opt_input = gr.Textbox( + label=i18n("指定输出文件夹"), value="opt" + ) + file_index3 = gr.Textbox( + label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), + value="", + interactive=True, + ) + file_index4 = gr.Dropdown( + label=i18n("自动检测index路径,下拉式选择(dropdown)"), + choices=sorted(index_paths), + interactive=True, + ) + f0method1 = gr.Radio( + label=i18n( + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU" + ), + choices=( + ["pm", "harvest", "crepe", "rmvpe"] + if config.dml == False + else ["pm", "harvest", "rmvpe"] + ), + value="rmvpe", + interactive=True, + ) + format1 = gr.Radio( + label=i18n("导出文件格式"), + choices=["wav", "flac", "mp3", "m4a"], + value="wav", + interactive=True, + ) + + refresh_button.click( + fn=lambda: change_choices()[1], + inputs=[], + outputs=file_index4, + api_name="infer_refresh_batch", + ) + # file_big_npy2 = gr.Textbox( + # label=i18n("特征文件路径"), + # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", + # interactive=True, + # ) + + with gr.Column(): + resample_sr1 = gr.Slider( + minimum=0, + maximum=48000, + label=i18n("后处理重采样至最终采样率,0为不进行重采样"), + value=0, + step=1, + interactive=True, + ) + rms_mix_rate1 = gr.Slider( + minimum=0, + maximum=1, + label=i18n( + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络" + ), + value=1, + interactive=True, + ) + protect1 = gr.Slider( + minimum=0, + maximum=0.5, + label=i18n( + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果" + ), + value=0.33, + step=0.01, + interactive=True, + ) + filter_radius1 = gr.Slider( + minimum=0, + maximum=7, + label=i18n( + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音" + ), + value=3, + step=1, + interactive=True, + ) + index_rate2 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("检索特征占比"), + value=1, + interactive=True, + ) + with gr.Row(): + dir_input = gr.Textbox( + label=i18n( + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)" + ), + placeholder="C:\\Users\\Desktop\\input_vocal_dir", + ) + inputs = gr.File( + file_count="multiple", + label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"), + ) + + with gr.Row(): + but1 = gr.Button(i18n("转换"), variant="primary") + vc_output3 = gr.Textbox(label=i18n("输出信息")) + + but1.click( + vc.vc_multi, + [ + spk_item, + dir_input, + opt_input, + inputs, + vc_transform1, + f0method1, + file_index3, + file_index4, + # file_big_npy2, + index_rate2, + filter_radius1, + resample_sr1, + rms_mix_rate1, + protect1, + format1, + ], + [vc_output3], + api_name="infer_convert_batch", + ) + sid0.change( + fn=vc.get_vc, + inputs=[sid0, protect0, protect1], + outputs=[spk_item, protect0, protect1, file_index2, file_index4], + api_name="infer_change_voice", + ) + with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): + with gr.Group(): + gr.Markdown( + value=i18n( + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。" + ) + ) + with gr.Row(): + with gr.Column(): + dir_wav_input = gr.Textbox( + label=i18n("输入待处理音频文件夹路径"), + placeholder="C:\\Users\\Desktop\\todo-songs", + ) + wav_inputs = gr.File( + file_count="multiple", + label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"), + ) + with gr.Column(): + model_choose = gr.Dropdown( + label=i18n("模型"), choices=uvr5_names + ) + agg = gr.Slider( + minimum=0, + maximum=20, + step=1, + label="人声提取激进程度", + value=10, + interactive=True, + visible=False, # 先不开放调整 + ) + opt_vocal_root = gr.Textbox( + label=i18n("指定输出主人声文件夹"), value="opt" + ) + opt_ins_root = gr.Textbox( + label=i18n("指定输出非主人声文件夹"), value="opt" + ) + format0 = gr.Radio( + label=i18n("导出文件格式"), + choices=["wav", "flac", "mp3", "m4a"], + value="flac", + interactive=True, + ) + but2 = gr.Button(i18n("转换"), variant="primary") + vc_output4 = gr.Textbox(label=i18n("输出信息")) + but2.click( + uvr, + [ + model_choose, + dir_wav_input, + opt_vocal_root, + wav_inputs, + opt_ins_root, + agg, + format0, + ], + [vc_output4], + api_name="uvr_convert", + ) + with gr.TabItem(i18n("训练")): + gr.Markdown( + value=i18n( + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. " + ) + ) + with gr.Row(): + exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test") + sr2 = gr.Radio( + label=i18n("目标采样率"), + choices=["40k", "48k"], + value="40k", + interactive=True, + ) + if_f0_3 = gr.Radio( + label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"), + choices=[True, False], + value=True, + interactive=True, + ) + version19 = gr.Radio( + label=i18n("版本"), + choices=["v1", "v2"], + value="v2", + interactive=True, + visible=True, + ) + np7 = gr.Slider( + minimum=0, + maximum=config.n_cpu, + step=1, + label=i18n("提取音高和处理数据使用的CPU进程数"), + value=int(np.ceil(config.n_cpu / 1.5)), + interactive=True, + ) + with gr.Group(): # 暂时单人的, 后面支持最多4人的#数据处理 + gr.Markdown( + value=i18n( + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. 
" + ) + ) + with gr.Row(): + trainset_dir4 = gr.Textbox( + label=i18n("输入训练文件夹路径"), + value=i18n("E:\\语音音频+标注\\米津玄师\\src"), + ) + spk_id5 = gr.Slider( + minimum=0, + maximum=4, + step=1, + label=i18n("请指定说话人id"), + value=0, + interactive=True, + ) + but1 = gr.Button(i18n("处理数据"), variant="primary") + info1 = gr.Textbox(label=i18n("输出信息"), value="") + but1.click( + preprocess_dataset, + [trainset_dir4, exp_dir1, sr2, np7], + [info1], + api_name="train_preprocess", + ) + with gr.Group(): + gr.Markdown( + value=i18n( + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)" + ) + ) + with gr.Row(): + with gr.Column(): + gpus6 = gr.Textbox( + label=i18n( + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2" + ), + value=gpus, + interactive=True, + visible=F0GPUVisible, + ) + gpu_info9 = gr.Textbox( + label=i18n("显卡信息"), value=gpu_info, visible=F0GPUVisible + ) + with gr.Column(): + f0method8 = gr.Radio( + label=i18n( + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU" + ), + choices=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"], + value="rmvpe_gpu", + interactive=True, + ) + gpus_rmvpe = gr.Textbox( + label=i18n( + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程" + ), + value="%s-%s" % (gpus, gpus), + interactive=True, + visible=F0GPUVisible, + ) + but2 = gr.Button(i18n("特征提取"), variant="primary") + info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) + f0method8.change( + fn=change_f0_method, + inputs=[f0method8], + outputs=[gpus_rmvpe], + ) + but2.click( + extract_f0_feature, + [ + gpus6, + np7, + f0method8, + if_f0_3, + exp_dir1, + version19, + gpus_rmvpe, + ], + [info2], + api_name="train_extract_f0_feature", + ) + with gr.Group(): + gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引")) + with gr.Row(): + save_epoch10 = gr.Slider( + minimum=1, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=5, + interactive=True, + ) + total_epoch11 = gr.Slider( + minimum=2, + maximum=1000, + step=1, + label=i18n("总训练轮数total_epoch"), + value=20, + interactive=True, + ) + batch_size12 = gr.Slider( + minimum=1, + maximum=40, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size, + interactive=True, + ) + if_save_latest13 = gr.Radio( + label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), + choices=[i18n("是"), i18n("否")], + value=i18n("否"), + interactive=True, + ) + if_cache_gpu17 = gr.Radio( + label=i18n( + "是否缓存所有训练集至显存. 
10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速" + ), + choices=[i18n("是"), i18n("否")], + value=i18n("否"), + interactive=True, + ) + if_save_every_weights18 = gr.Radio( + label=i18n( + "是否在每次保存时间点将最终小模型保存至weights文件夹" + ), + choices=[i18n("是"), i18n("否")], + value=i18n("否"), + interactive=True, + ) + with gr.Row(): + pretrained_G14 = gr.Textbox( + label=i18n("加载预训练底模G路径"), + value="assets/pretrained_v2/f0G40k.pth", + interactive=True, + ) + pretrained_D15 = gr.Textbox( + label=i18n("加载预训练底模D路径"), + value="assets/pretrained_v2/f0D40k.pth", + interactive=True, + ) + sr2.change( + change_sr2, + [sr2, if_f0_3, version19], + [pretrained_G14, pretrained_D15], + ) + version19.change( + change_version19, + [sr2, if_f0_3, version19], + [pretrained_G14, pretrained_D15, sr2], + ) + if_f0_3.change( + change_f0, + [if_f0_3, sr2, version19], + [f0method8, gpus_rmvpe, pretrained_G14, pretrained_D15], + ) + gpus16 = gr.Textbox( + label=i18n( + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2" + ), + value=gpus, + interactive=True, + ) + but3 = gr.Button(i18n("训练模型"), variant="primary") + but4 = gr.Button(i18n("训练特征索引"), variant="primary") + but5 = gr.Button(i18n("一键训练"), variant="primary") + info3 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=10) + but3.click( + click_train, + [ + exp_dir1, + sr2, + if_f0_3, + spk_id5, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + if_save_every_weights18, + version19, + ], + info3, + api_name="train_start", + ) + but4.click(train_index, [exp_dir1, version19], info3) + but5.click( + train1key, + [ + exp_dir1, + sr2, + if_f0_3, + trainset_dir4, + spk_id5, + np7, + f0method8, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + if_save_every_weights18, + version19, + gpus_rmvpe, + ], + info3, + api_name="train_start_all", + ) + + with gr.TabItem(i18n("ckpt处理")): + with gr.Group(): + gr.Markdown(value=i18n("模型融合, 可用于测试音色融合")) + with gr.Row(): + ckpt_a = gr.Textbox( + label=i18n("A模型路径"), value="", interactive=True + ) + ckpt_b = gr.Textbox( + label=i18n("B模型路径"), value="", interactive=True + ) + alpha_a = gr.Slider( + minimum=0, + maximum=1, + label=i18n("A模型权重"), + value=0.5, + interactive=True, + ) + with gr.Row(): + sr_ = gr.Radio( + label=i18n("目标采样率"), + choices=["40k", "48k"], + value="40k", + interactive=True, + ) + if_f0_ = gr.Radio( + label=i18n("模型是否带音高指导"), + choices=[i18n("是"), i18n("否")], + value=i18n("是"), + interactive=True, + ) + info__ = gr.Textbox( + label=i18n("要置入的模型信息"), + value="", + max_lines=8, + interactive=True, + ) + name_to_save0 = gr.Textbox( + label=i18n("保存的模型名不带后缀"), + value="", + max_lines=1, + interactive=True, + ) + version_2 = gr.Radio( + label=i18n("模型版本型号"), + choices=["v1", "v2"], + value="v1", + interactive=True, + ) + with gr.Row(): + but6 = gr.Button(i18n("融合"), variant="primary") + info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) + but6.click( + merge, + [ + ckpt_a, + ckpt_b, + alpha_a, + sr_, + if_f0_, + info__, + name_to_save0, + version_2, + ], + info4, + api_name="ckpt_merge", + ) # def merge(path1,path2,alpha1,sr,f0,info): + with gr.Group(): + gr.Markdown( + value=i18n("修改模型信息(仅支持weights文件夹下提取的小模型文件)") + ) + with gr.Row(): + ckpt_path0 = gr.Textbox( + label=i18n("模型路径"), value="", interactive=True + ) + info_ = gr.Textbox( + label=i18n("要改的模型信息"), + value="", + max_lines=8, + interactive=True, + ) + name_to_save1 = gr.Textbox( + label=i18n("保存的文件名, 默认空为和源文件同名"), + 
value="", + max_lines=8, + interactive=True, + ) + with gr.Row(): + but7 = gr.Button(i18n("修改"), variant="primary") + info5 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) + but7.click( + change_info, + [ckpt_path0, info_, name_to_save1], + info5, + api_name="ckpt_modify", + ) + with gr.Group(): + gr.Markdown( + value=i18n("查看模型信息(仅支持weights文件夹下提取的小模型文件)") + ) + with gr.Row(): + ckpt_path1 = gr.Textbox( + label=i18n("模型路径"), value="", interactive=True + ) + but8 = gr.Button(i18n("查看"), variant="primary") + info6 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) + but8.click(show_info, [ckpt_path1], info6, api_name="ckpt_show") + with gr.Group(): + gr.Markdown( + value=i18n( + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况" + ) + ) + with gr.Row(): + ckpt_path2 = gr.Textbox( + label=i18n("模型路径"), + value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth", + interactive=True, + ) + save_name = gr.Textbox( + label=i18n("保存名"), value="", interactive=True + ) + sr__ = gr.Radio( + label=i18n("目标采样率"), + choices=["32k", "40k", "48k"], + value="40k", + interactive=True, + ) + if_f0__ = gr.Radio( + label=i18n("模型是否带音高指导,1是0否"), + choices=["1", "0"], + value="1", + interactive=True, + ) + version_1 = gr.Radio( + label=i18n("模型版本型号"), + choices=["v1", "v2"], + value="v2", + interactive=True, + ) + info___ = gr.Textbox( + label=i18n("要置入的模型信息"), + value="", + max_lines=8, + interactive=True, + ) + but9 = gr.Button(i18n("提取"), variant="primary") + info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) + ckpt_path2.change( + change_info_, [ckpt_path2], [sr__, if_f0__, version_1] + ) + but9.click( + extract_small_model, + [ckpt_path2, save_name, sr__, if_f0__, info___, version_1], + info7, + api_name="ckpt_extract", + ) + + with gr.TabItem(i18n("Onnx导出")): + with gr.Row(): + ckpt_dir = gr.Textbox( + label=i18n("RVC模型路径"), value="", interactive=True + ) + with gr.Row(): + onnx_dir = gr.Textbox( + label=i18n("Onnx输出路径"), value="", interactive=True + ) + with gr.Row(): + infoOnnx = gr.Label(label="info") + with gr.Row(): + butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary") + butOnnx.click( + export_onnx, [ckpt_dir, onnx_dir], infoOnnx, api_name="export_onnx" + ) + + tab_faq = i18n("常见问题解答") + with gr.TabItem(tab_faq): + try: + if tab_faq == "常见问题解答": + with open("docs/cn/faq.md", "r", encoding="utf8") as f: + info = f.read() + else: + with open("docs/en/faq_en.md", "r", encoding="utf8") as f: + info = f.read() + gr.Markdown(value=info) + except: + gr.Markdown(traceback.format_exc()) + + if config.iscolab: + app.queue(concurrency_count=511, max_size=1022).launch(share=True) + else: + app.queue(concurrency_count=511, max_size=1022).launch( + server_name="0.0.0.0", + inbrowser=not config.noautoopen, + server_port=config.listen_port, + quiet=True, + ) \ No newline at end of file diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py index 765c54c61..8184ca004 100644 --- a/infer/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -235,8 +235,9 @@ def plot_spectrogram_to_numpy(spectrogram): plt.tight_layout() fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + # Fix for newer matplotlib versions + buf = fig.canvas.buffer_rgba() + data = np.asarray(buf)[:, :, :3] plt.close() return data @@ -266,8 +267,9 @@ def plot_alignment_to_numpy(alignment, info=None): plt.tight_layout() fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), 
dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + # Fix for newer matplotlib versions + buf = fig.canvas.buffer_rgba() + data = np.asarray(buf)[:, :, :3] plt.close() return data diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py index 96a69dee4..1f9d725b9 100644 --- a/infer/modules/train/extract_feature_print.py +++ b/infer/modules/train/extract_feature_print.py @@ -86,6 +86,15 @@ def readwave(wav_path, normalize=False): % model_path ) exit(0) + +# Fix for PyTorch 2.6+ weights_only default change +import torch.serialization +try: + import fairseq.data.dictionary + torch.serialization.add_safe_globals([fairseq.data.dictionary.Dictionary]) +except: + pass + models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( [model_path], suffix="", diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index 38a567828..aefdd0349 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -18,6 +18,22 @@ import torch +# Device detection for MPS (Apple Silicon), CUDA, or CPU +USE_MPS = False +USE_CUDA = False +DEVICE = "cpu" + +if torch.backends.mps.is_available(): + USE_MPS = True + DEVICE = "mps" + print("Using Apple Silicon MPS GPU acceleration") +elif torch.cuda.is_available(): + USE_CUDA = True + DEVICE = "cuda" + print("Using NVIDIA CUDA GPU acceleration") +else: + print("No GPU detected, using CPU") + try: import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import @@ -79,6 +95,15 @@ global_step = 0 +def to_device(tensor, rank=0): + """Move tensor to appropriate device (MPS, CUDA, or CPU)""" + if USE_MPS: + return tensor.to("mps") + elif USE_CUDA: + return tensor.cuda(rank, non_blocking=True) + return tensor + + class EpochRecorder: def __init__(self): self.last_time = ttime() @@ -167,7 +192,7 @@ def run(rank, n_gpus, hps, logger: logging.Logger): hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model, - is_half=hps.train.fp16_run, + is_half=hps.train.fp16_run and not USE_MPS, # MPS doesn't support fp16 well sr=hps.sample_rate, ) else: @@ -175,12 +200,17 @@ def run(rank, n_gpus, hps, logger: logging.Logger): hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model, - is_half=hps.train.fp16_run, + is_half=hps.train.fp16_run and not USE_MPS, ) - if torch.cuda.is_available(): + # Move models to device + if USE_MPS: + net_g = net_g.to("mps") + elif USE_CUDA: net_g = net_g.cuda(rank) net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm) - if torch.cuda.is_available(): + if USE_MPS: + net_d = net_d.to("mps") + elif USE_CUDA: net_d = net_d.cuda(rank) optim_g = torch.optim.AdamW( net_g.parameters(), @@ -194,16 +224,16 @@ def run(rank, n_gpus, hps, logger: logging.Logger): betas=hps.train.betas, eps=hps.train.eps, ) - # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) - # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) + # DDP wrapping (not used for MPS single-GPU) if hasattr(torch, "xpu") and torch.xpu.is_available(): pass - elif torch.cuda.is_available(): + elif USE_CUDA: net_g = DDP(net_g, device_ids=[rank]) net_d = DDP(net_d, device_ids=[rank]) - else: + elif not USE_MPS: net_g = DDP(net_g) net_d = DDP(net_d) + # MPS: no DDP needed for single GPU try: # 如果能加载自动resume _, _, _, epoch_str = utils.load_checkpoint( @@ -260,7 +290,8 @@ def run(rank, n_gpus, hps, logger: logging.Logger): optim_d, 
gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 ) - scaler = GradScaler(enabled=hps.train.fp16_run) + # MPS doesn't support fp16 GradScaler well + scaler = GradScaler(enabled=hps.train.fp16_run and not USE_MPS) cache = [] for epoch in range(epoch_str, hps.train.epochs + 1): @@ -341,18 +372,18 @@ def train_and_evaluate( wave_lengths, sid, ) = info - # Load on CUDA - if torch.cuda.is_available(): - phone = phone.cuda(rank, non_blocking=True) - phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + # Load on GPU (CUDA or MPS) + if USE_CUDA or USE_MPS: + phone = to_device(phone, rank) + phone_lengths = to_device(phone_lengths, rank) if hps.if_f0 == 1: - pitch = pitch.cuda(rank, non_blocking=True) - pitchf = pitchf.cuda(rank, non_blocking=True) - sid = sid.cuda(rank, non_blocking=True) - spec = spec.cuda(rank, non_blocking=True) - spec_lengths = spec_lengths.cuda(rank, non_blocking=True) - wave = wave.cuda(rank, non_blocking=True) - wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + pitch = to_device(pitch, rank) + pitchf = to_device(pitchf, rank) + sid = to_device(sid, rank) + spec = to_device(spec, rank) + spec_lengths = to_device(spec_lengths, rank) + wave = to_device(wave, rank) + wave_lengths = to_device(wave_lengths, rank) # Cache on list if hps.if_f0 == 1: cache.append( @@ -412,21 +443,20 @@ def train_and_evaluate( ) = info else: phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info - ## Load on CUDA - if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available(): - phone = phone.cuda(rank, non_blocking=True) - phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + ## Load on GPU (CUDA or MPS) + if (hps.if_cache_data_in_gpu == False) and (USE_CUDA or USE_MPS): + phone = to_device(phone, rank) + phone_lengths = to_device(phone_lengths, rank) if hps.if_f0 == 1: - pitch = pitch.cuda(rank, non_blocking=True) - pitchf = pitchf.cuda(rank, non_blocking=True) - sid = sid.cuda(rank, non_blocking=True) - spec = spec.cuda(rank, non_blocking=True) - spec_lengths = spec_lengths.cuda(rank, non_blocking=True) - wave = wave.cuda(rank, non_blocking=True) - # wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + pitch = to_device(pitch, rank) + pitchf = to_device(pitchf, rank) + sid = to_device(sid, rank) + spec = to_device(spec, rank) + spec_lengths = to_device(spec_lengths, rank) + wave = to_device(wave, rank) # Calculate - with autocast(enabled=hps.train.fp16_run): + with autocast(enabled=hps.train.fp16_run and not USE_MPS): if hps.if_f0 == 1: ( y_hat, diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py index c128707cf..1e4afde83 100644 --- a/infer/modules/vc/utils.py +++ b/infer/modules/vc/utils.py @@ -1,7 +1,17 @@ import os +import torch from fairseq import checkpoint_utils +# PyTorch 2.6+ compatibility: weights_only=True by default breaks fairseq loading +# Monkey-patch torch.load to use weights_only=False for model loading +_original_torch_load = torch.load +def _patched_torch_load(*args, **kwargs): + if 'weights_only' not in kwargs: + kwargs['weights_only'] = False + return _original_torch_load(*args, **kwargs) +torch.load = _patched_torch_load + def get_index_path_from_model(sid): return next( diff --git a/inference_log.txt b/inference_log.txt new file mode 100644 index 000000000..2b53e1c21 --- /dev/null +++ b/inference_log.txt @@ -0,0 +1,24 @@ +/opt/anaconda3/envs/rvc/lib/python3.10/site-packages/pyworld/__init__.py:13: UserWarning: pkg_resources is deprecated as an API. 
See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + import pkg_resources +2025-12-08 01:10:34 | INFO | configs.config | No supported Nvidia GPU found +2025-12-08 01:10:34 | INFO | configs.config | MPS available but using CPU for stability +2025-12-08 01:10:34 | INFO | configs.config | overwrite v1/32k.json +2025-12-08 01:10:34 | INFO | configs.config | overwrite v1/40k.json +2025-12-08 01:10:34 | INFO | configs.config | overwrite v1/48k.json +2025-12-08 01:10:34 | INFO | configs.config | overwrite v2/48k.json +2025-12-08 01:10:34 | INFO | configs.config | overwrite v2/32k.json +2025-12-08 01:10:34 | INFO | configs.config | overwrite preprocess_per to 3 +2025-12-08 01:10:34 | INFO | configs.config | Use cpu instead +2025-12-08 01:10:34 | INFO | configs.config | Half-precision floating-point: False, device: cpu +2025-12-08 01:10:34 | INFO | infer.modules.vc.modules | Get sid: Voice_New.pth +2025-12-08 01:10:34 | INFO | infer.modules.vc.modules | Loading: assets/weights/Voice_New.pth +/opt/anaconda3/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +2025-12-08 01:10:34 | INFO | infer.modules.vc.modules | Select index: logs/Voice_New/added_IVF86_Flat_nprobe_1.index +2025-12-08 01:10:34 | INFO | fairseq.tasks.hubert_pretraining | current directory is /Users/arunkumarv/Music/Voice Clone/rvc-webui +2025-12-08 01:10:34 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False} +2025-12-08 01:10:34 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False} +/opt/anaconda3/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. 
+ WeightNorm.apply(module, name, dim) +/opt/anaconda3/envs/rvc/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/run_inference.py b/run_inference.py new file mode 100644 index 000000000..f77debf23 --- /dev/null +++ b/run_inference.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +import os +import sys + +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + +import torch +import faiss +faiss.omp_set_num_threads(1) + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from dotenv import load_dotenv +load_dotenv() + +os.environ['weight_root'] = 'assets/weights' +os.environ['index_root'] = 'logs' +os.environ['rmvpe_root'] = 'assets/rmvpe' + +print(f"PyTorch version: {torch.__version__}") +print(f"CUDA available: {torch.cuda.is_available()}") +print(f"MPS available: {torch.backends.mps.is_available() if hasattr(torch.backends, 'mps') else False}") + +from configs.config import Config +config = Config() +print(f"Device selected by config: {config.device}") + +from infer.modules.vc.modules import VC +vc_instance = VC(config) + +model_name = "Voice_New.pth" +input_audio = "/Users/arunkumarv/Music/Voice Clone/Voice_convert.mp3" +output_audio = "/Users/arunkumarv/Music/Voice Clone/rvc-webui/output/Voice_New/converted.wav" + +os.makedirs(os.path.dirname(output_audio), exist_ok=True) + +print(f"\nLoading model: {model_name}") +vc_instance.get_vc(model_name) + +print(f"Converting audio: {input_audio}") +print(f"Output will be saved to: {output_audio}") + +print("Starting vc_single...") +sys.stdout.flush() + +import soundfile as sf + +try: + result_message, audio_result = vc_instance.vc_single( + sid=0, + input_audio_path=input_audio, + f0_up_key=0, + f0_file=None, + f0_method="rmvpe", + file_index=f"logs/Voice_New/added_IVF86_Flat_nprobe_1.index", + file_index2="", + index_rate=0.75, + filter_radius=3, + resample_sr=0, + rms_mix_rate=0.25, + protect=0.33 + ) + + print(f"\nResult: {result_message}") + + sample_rate, audio_data = audio_result + if audio_data is not None and sample_rate is not None: + sf.write(output_audio, audio_data, sample_rate) + print(f"✓ Audio saved successfully to: {output_audio}") + else: + print("✗ Conversion failed!") + sys.exit(1) +except Exception as e: + import traceback + print(f"Error: {e}") + traceback.print_exc() + sys.exit(1) diff --git a/run_inference_api.py b/run_inference_api.py new file mode 100644 index 000000000..e1fcfc7ad --- /dev/null +++ b/run_inference_api.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +from gradio_client import Client +import os + +client = Client("http://localhost:7865") + +input_audio = "/Users/arunkumarv/Music/Voice Clone/Voice_convert.mp3" +output_dir = "/Users/arunkumarv/Music/Voice Clone/rvc-webui/output/Voice_New" +os.makedirs(output_dir, exist_ok=True) + +print(f"Submitting inference request...") +print(f"Input: {input_audio}") +print(f"Model: Voice_New.pth") +print(f"F0 Method: pm") + +result = client.predict( + spk_item="Voice_New.pth", + input_audio0=input_audio, + vc_transform0=0, # pitch shift + f0_file=None, + f0method0="pm", # F0 method + file_index1="", # manual index path + file_index2="logs/Voice_New/added_IVF86_Flat_nprobe_1.index", # dropdown selection + index_rate1=0.75, # retrieval mix + filter_radius0=3, # median filter + resample_sr0=0, # output 
sample rate + rms_mix_rate0=0.25, # volume envelope + protect0=0.33, # consonant protection + api_name="/infer_convert" +) + +output_message, output_audio_tuple = result +print(f"\nResult: {output_message}") + +if output_audio_tuple and len(output_audio_tuple) > 1: + output_path = os.path.join(output_dir, "converted.wav") + # Gradio returns the audio file path + if isinstance(output_audio_tuple, tuple) and len(output_audio_tuple) == 2: + sr, audio_file = output_audio_tuple + print(f"✓ Audio converted successfully!") + print(f"Sample rate: {sr} Hz") + print(f"Output: {output_path}") + else: + print(f"Unexpected output format: {output_audio_tuple}") +else: + print("✗ Conversion failed!") diff --git a/setup-doc.md b/setup-doc.md new file mode 100644 index 000000000..153f17980 --- /dev/null +++ b/setup-doc.md @@ -0,0 +1,67 @@ +# RVC WebUI Setup Documentation + +## Environment + +- macOS (Apple Silicon) +- Python 3.10 (Conda) +- PyTorch with MPS support + +## Problems and Solutions + +### 1. fairseq Installation Failure + +**Error**: `omegaconf` metadata parsing error with pip 25.x + +**Fix**: +```bash +pip install "pip<24.1" +pip install fairseq==0.12.2 +``` + +### 2. Gradio Version Mismatch + +**Error**: `concurrency_count` parameter not recognized (wrong gradio version installed) + +**Fix**: +```bash +pip install gradio==3.34.0 +``` + +### 3. gradio_client.serializing Module Not Found + +**Error**: `ModuleNotFoundError: No module named 'gradio_client.serializing'` + +**Cause**: Modern gradio-client (2.x) removed the serializing module that gradio 3.34.0 expects + +**Fix**: +```bash +pip install gradio-client==0.2.7 +``` + +## Working Installation Sequence + +```bash +conda create -n rvc python=3.10 -y +conda activate rvc + +pip install "pip<24.1" +pip install torch torchvision torchaudio +pip install -r requirements.txt +pip install fairseq==0.12.2 +pip install gradio==3.34.0 +pip install gradio-client==0.2.7 + +python tools/download_models.py +python infer-web.py +``` + +## Verification + +```bash +python -c "import torch; print(torch.backends.mps.is_available())" +python -c "from gradio_client.serializing import Serializable; print('OK')" +``` + +## Access + +WebUI runs at: http://localhost:7865 diff --git a/temp_download_dialects.py b/temp_download_dialects.py new file mode 100644 index 000000000..c93b14491 --- /dev/null +++ b/temp_download_dialects.py @@ -0,0 +1,33 @@ + +import os +from datasets import load_dataset +from pathlib import Path +import soundfile as sf + +print("Loading English Dialects dataset...") +ds = load_dataset("ylacombe/english_dialects", split="train", streaming=True) + +datasets_dir = Path("datasets/accent_non_native") +datasets_dir.mkdir(parents=True, exist_ok=True) + +count = 0 +max_samples = 150 + +# Target dialects for "non-native" feel (regional accents) +target_dialects = ["scottish", "irish", "welsh", "northern"] + +for sample in ds: + dialect = sample.get("dialect", "").lower() + + if any(d in dialect for d in target_dialects) and count < max_samples: + audio = sample["audio"] + out_path = datasets_dir / f"{dialect}_{count}.wav" + + sf.write(str(out_path), audio["array"], audio["sampling_rate"]) + count += 1 + print(f"Saved {out_path.name} (total: {count})") + + if count >= max_samples: + break + +print(f"Downloaded {count} dialect samples") diff --git a/temp_download_genshin.py b/temp_download_genshin.py new file mode 100644 index 000000000..27e2e38eb --- /dev/null +++ b/temp_download_genshin.py @@ -0,0 +1,38 @@ + +import os +from datasets import 
load_dataset +from pathlib import Path +import soundfile as sf + +print("Loading Genshin Voice dataset...") +ds = load_dataset("simon3000/genshin-voice", split="train", streaming=True) + +datasets_dir = Path("datasets/anime_airy") +datasets_dir.mkdir(parents=True, exist_ok=True) + +count = 0 +max_samples = 150 + +# Target characters with airy/cute voices (English) +target_chars = ["paimon", "barbara", "kokomi", "nahida", "klee", "qiqi", "diona"] + +for sample in ds: + speaker = str(sample.get("speaker", "")).lower() + lang = sample.get("language", "") + + # Only English samples + if lang != "en": + continue + + if any(char in speaker for char in target_chars) and count < max_samples: + audio = sample["audio"] + out_path = datasets_dir / f"{speaker.replace(' ', '_')}_{count}.wav" + + sf.write(str(out_path), audio["array"], audio["sampling_rate"]) + count += 1 + print(f"Saved {out_path.name} (total: {count})") + + if count >= max_samples: + break + +print(f"Downloaded {count} anime voice samples") diff --git a/temp_download_hifi.py b/temp_download_hifi.py new file mode 100644 index 000000000..7724e5af6 --- /dev/null +++ b/temp_download_hifi.py @@ -0,0 +1,46 @@ + +import os +from datasets import load_dataset +from pathlib import Path +import soundfile as sf + +print("Loading Hi-Fi TTS dataset...") +ds = load_dataset("MikhailT/hifi-tts", split="train", streaming=True) + +datasets_dir = Path("datasets") + +# Speaker ID to voice type mapping for HiFi-TTS +# HiFi has 10 speakers total +voice_map = { + "92": "male_low", # Deep male + "6097": "male_mid", # Mid male + "6670": "female_low", # Lower female + "6671": "female_high", # Higher female + "8051": "singing_male", + "9017": "singing_female", +} + +counts = {k: 0 for k in set(voice_map.values())} +max_per_type = 100 + +for sample in ds: + speaker = str(sample.get("speaker", "")) + + if speaker in voice_map: + voice_type = voice_map[speaker] + + if counts[voice_type] < max_per_type: + out_dir = datasets_dir / voice_type + out_dir.mkdir(parents=True, exist_ok=True) + + audio = sample["audio"] + out_path = out_dir / f"hifi_{speaker}_{counts[voice_type]}.wav" + + sf.write(str(out_path), audio["array"], audio["sampling_rate"]) + counts[voice_type] += 1 + print(f"Saved {out_path.name} ({voice_type}: {counts[voice_type]})") + + if all(c >= max_per_type for c in counts.values()): + break + +print(f"Final counts: {counts}") diff --git a/temp_download_libritts.py b/temp_download_libritts.py new file mode 100644 index 000000000..b8d7dd48f --- /dev/null +++ b/temp_download_libritts.py @@ -0,0 +1,45 @@ + +import os +from datasets import load_dataset +from pathlib import Path + +print("Loading LibriTTS dataset (this may take a while)...") +ds = load_dataset("mythicinfinity/libritts", "clean", split="train.clean.100", streaming=True) + +# Sample speakers - take first 50 samples per target voice type +# LibriTTS speaker IDs are in the 'speaker_id' column +target_speakers = { + "male_low": ["19", "26", "1272"], # Deep male voices + "male_mid": ["32", "40", "1089"], # Mid-range male + "female_low": ["87", "103", "1284"], # Lower female + "female_high": ["121", "237", "3570"], # Higher female +} + +datasets_dir = Path("datasets") +counts = {k: 0 for k in target_speakers} +max_per_type = 100 # Max samples per voice type + +for sample in ds: + speaker = str(sample.get("speaker_id", "")) + + for voice_type, speakers in target_speakers.items(): + if speaker in speakers and counts[voice_type] < max_per_type: + out_dir = datasets_dir / voice_type + 
out_dir.mkdir(parents=True, exist_ok=True) + + audio = sample["audio"] + out_path = out_dir / f"{speaker}_{sample['id']}.wav" + + # Save audio + import soundfile as sf + sf.write(str(out_path), audio["array"], audio["sampling_rate"]) + + counts[voice_type] += 1 + print(f"Saved {out_path.name} ({voice_type}: {counts[voice_type]})") + + # Check if we have enough + if all(c >= max_per_type for c in counts.values()): + print("Collected enough samples!") + break + +print(f"Final counts: {counts}") diff --git a/tools/audio_preprocessor.py b/tools/audio_preprocessor.py new file mode 100644 index 000000000..7e564169a --- /dev/null +++ b/tools/audio_preprocessor.py @@ -0,0 +1,81 @@ +import os +import argparse +import librosa +import soundfile as sf +import numpy as np +from pydub import AudioSegment +from pydub.silence import split_on_silence +from tqdm import tqdm + +def process_audio(input_path, output_dir, sr=40000, min_silence_len=500, silence_thresh=-40, chunk_len=10000): + """ + Process audio file: convert to wav, normalize, remove silence, split into chunks + """ + filename = os.path.basename(input_path).split('.')[0] + + print(f"Processing {input_path}...") + + # Load audio + try: + audio = AudioSegment.from_file(input_path) + except Exception as e: + print(f"Error loading {input_path}: {e}") + return + + # Normalize + audio = audio.normalize() + + # Split on silence + chunks = split_on_silence( + audio, + min_silence_len=min_silence_len, + silence_thresh=silence_thresh, + keep_silence=100 + ) + + # Combine small chunks to reach target length + output_chunks = [] + current_chunk = AudioSegment.empty() + + for chunk in chunks: + if len(current_chunk) + len(chunk) < chunk_len: + current_chunk += chunk + else: + output_chunks.append(current_chunk) + current_chunk = chunk + + if len(current_chunk) > 0: + output_chunks.append(current_chunk) + + # Save chunks + os.makedirs(output_dir, exist_ok=True) + + for i, chunk in enumerate(output_chunks): + # Convert to target sample rate + chunk = chunk.set_frame_rate(sr).set_channels(1) + + # Export + out_name = f"{filename}_{i:03d}.wav" + out_path = os.path.join(output_dir, out_name) + chunk.export(out_path, format="wav") + + print(f"Saved {len(output_chunks)} chunks to {output_dir}") + +def main(): + parser = argparse.ArgumentParser(description="Audio Dataset Preprocessor for RVC") + parser.add_argument("--input", "-i", required=True, help="Input file or directory") + parser.add_argument("--output", "-o", required=True, help="Output directory") + parser.add_argument("--sr", type=int, default=40000, help="Target sample rate (default: 40000)") + parser.add_argument("--len", type=int, default=10000, help="Target chunk length in ms (default: 10000)") + + args = parser.parse_args() + + if os.path.isfile(args.input): + process_audio(args.input, args.output, sr=args.sr, chunk_len=args.len) + elif os.path.isdir(args.input): + files = [f for f in os.listdir(args.input) if f.lower().endswith(('.wav', '.mp3', '.flac', '.m4a', '.ogg'))] + for f in tqdm(files): + process_audio(os.path.join(args.input, f), args.output, sr=args.sr, chunk_len=args.len) + +if __name__ == "__main__": + main() diff --git a/tools/download_datasets.py b/tools/download_datasets.py new file mode 100644 index 000000000..85407ea82 --- /dev/null +++ b/tools/download_datasets.py @@ -0,0 +1,246 @@ +""" +Simplified dataset downloader - downloads voice data directly. 
+""" +import os +import sys +from pathlib import Path + +# Ensure we have datasets library +try: + from datasets import load_dataset + import soundfile as sf +except ImportError: + print("Installing required packages...") + os.system("pip install datasets soundfile") + from datasets import load_dataset + import soundfile as sf + +# Setup paths +SCRIPT_DIR = Path(__file__).parent +PROJECT_ROOT = SCRIPT_DIR.parent +DATASETS_DIR = PROJECT_ROOT / "datasets" + +def download_libritts(): + """Download LibriTTS samples for male/female voices.""" + print("\n=== Downloading LibriTTS (Male/Female Voices) ===") + + ds = load_dataset("mythicinfinity/libritts", "clean", split="train.clean.100", streaming=True) + + # Speaker ID to voice type mapping + speaker_voice_map = { + "19": "male_low", + "26": "male_low", + "1272": "male_low", + "32": "male_mid", + "40": "male_mid", + "1089": "male_mid", + "87": "female_low", + "103": "female_low", + "1284": "female_low", + "121": "female_high", + "237": "female_high", + "3570": "female_high", + } + + counts = {} + for vt in set(speaker_voice_map.values()): + counts[vt] = 0 + (DATASETS_DIR / vt).mkdir(parents=True, exist_ok=True) + + max_per_type = 100 + + for sample in ds: + speaker = str(sample.get("speaker_id", "")) + + if speaker in speaker_voice_map: + voice_type = speaker_voice_map[speaker] + + if counts[voice_type] < max_per_type: + out_dir = DATASETS_DIR / voice_type + audio = sample["audio"] + out_path = out_dir / f"libritts_{speaker}_{counts[voice_type]}.wav" + + sf.write(str(out_path), audio["array"], audio["sampling_rate"]) + counts[voice_type] += 1 + print(f" {voice_type}: {counts[voice_type]}/{max_per_type}", end="\r") + + if all(c >= max_per_type for c in counts.values()): + break + + print(f"\nLibriTTS complete: {counts}") + +def download_dialects(): + """Download English dialect samples.""" + print("\n=== Downloading English Dialects (Accents) ===") + + out_dir = DATASETS_DIR / "accent_non_native" + out_dir.mkdir(parents=True, exist_ok=True) + + count = 0 + max_samples = 150 + configs = ["scottish_male", "scottish_female", "irish_male", "welsh_male", "welsh_female", "northern_male"] + + for config in configs: + if count >= max_samples: + break + try: + ds = load_dataset("ylacombe/english_dialects", config, split="train", streaming=True) + for sample in ds: + if count >= max_samples: + break + audio = sample["audio"] + out_path = out_dir / f"dialect_{config}_{count}.wav" + sf.write(str(out_path), audio["array"], audio["sampling_rate"]) + count += 1 + print(f" accent_non_native: {count}/{max_samples}", end="\r") + except Exception as e: + print(f" Error with {config}: {e}") + + print(f"\nDialects complete: {count} samples") + +def download_genshin(): + """Download Genshin voices for anime style.""" + print("\n=== Downloading Genshin Voices (Anime Style) ===") + + ds = load_dataset("simon3000/genshin-voice", split="train", streaming=True) + + out_dir = DATASETS_DIR / "anime_airy" + out_dir.mkdir(parents=True, exist_ok=True) + + count = 0 + max_samples = 150 + target_chars = ["paimon", "barbara", "kokomi", "nahida", "klee", "qiqi", "diona"] + + for sample in ds: + speaker = str(sample.get("speaker", "")).lower() + lang = sample.get("language", "") + + if lang != "en": + continue + + if any(char in speaker for char in target_chars) and count < max_samples: + audio = sample["audio"] + clean_speaker = speaker.replace(" ", "_").replace("/", "_") + out_path = out_dir / f"genshin_{clean_speaker}_{count}.wav" + + sf.write(str(out_path), audio["array"], 
audio["sampling_rate"]) + count += 1 + print(f" anime_airy: {count}/{max_samples}", end="\r") + + if count >= max_samples: + break + + print(f"\nGenshin complete: {count} samples") + +def download_hifi(): + """Download Hi-Fi TTS for singing voices.""" + print("\n=== Downloading Hi-Fi TTS (Singing/High Quality) ===") + + ds = load_dataset("MikhailT/hifi-tts", "clean", split="train", streaming=True) + + # Map speakers to voice types + speaker_map = { + "92": "singing_male", + "6097": "singing_male", + "6670": "singing_female", + "6671": "singing_female", + } + + counts = {} + for vt in set(speaker_map.values()): + counts[vt] = 0 + (DATASETS_DIR / vt).mkdir(parents=True, exist_ok=True) + + max_per_type = 100 + + for sample in ds: + speaker = str(sample.get("speaker", "")) + + if speaker in speaker_map: + voice_type = speaker_map[speaker] + + if counts[voice_type] < max_per_type: + out_dir = DATASETS_DIR / voice_type + audio = sample["audio"] + out_path = out_dir / f"hifi_{speaker}_{counts[voice_type]}.wav" + + sf.write(str(out_path), audio["array"], audio["sampling_rate"]) + counts[voice_type] += 1 + print(f" {voice_type}: {counts[voice_type]}/{max_per_type}", end="\r") + + if all(c >= max_per_type for c in counts.values()): + break + + print(f"\nHi-Fi TTS complete: {counts}") + +def print_summary(): + """Print download summary.""" + print("\n" + "=" * 50) + print("DOWNLOAD SUMMARY") + print("=" * 50) + + voice_types = [ + "male_low", "male_mid", "female_low", "female_high", + "anime_airy", "accent_non_native", "singing_male", "singing_female", + "child", "elderly" + ] + + total = 0 + for vt in voice_types: + vt_dir = DATASETS_DIR / vt + if vt_dir.exists(): + files = list(vt_dir.glob("*.wav")) + count = len(files) + total += count + status = "✓" if count > 0 else "✗" + print(f" {status} {vt}: {count} files") + else: + print(f" ✗ {vt}: 0 files") + + print(f"\nTotal: {total} audio files downloaded") + print("\nNote: 'child' and 'elderly' need manual data - not available in these datasets.") + +def main(): + print("=" * 50) + print("RVC Voice Dataset Downloader") + print("=" * 50) + print(f"Output: {DATASETS_DIR}") + + # Create all directories + for vt in ["male_low", "male_mid", "female_low", "female_high", + "anime_airy", "accent_non_native", "singing_male", "singing_female", + "child", "elderly"]: + (DATASETS_DIR / vt).mkdir(parents=True, exist_ok=True) + + # Download each dataset + try: + download_libritts() + except Exception as e: + print(f"Error downloading LibriTTS: {e}") + + try: + download_dialects() + except Exception as e: + print(f"Error downloading Dialects: {e}") + + try: + download_genshin() + except Exception as e: + print(f"Error downloading Genshin: {e}") + + try: + download_hifi() + except Exception as e: + print(f"Error downloading Hi-Fi TTS: {e}") + + print_summary() + + print("\n" + "=" * 50) + print("NEXT STEPS") + print("=" * 50) + print("1. Review downloaded files in datasets/ folder") + print("2. Train models: python tools/train_batch.py --voice male_low") + print("3. 
Run experiments: python tools/run_experiments_batch.py") + +if __name__ == "__main__": + main() diff --git a/tools/run_experiments_batch.py b/tools/run_experiments_batch.py new file mode 100644 index 000000000..b1cd65b26 --- /dev/null +++ b/tools/run_experiments_batch.py @@ -0,0 +1,93 @@ +import os +import subprocess +import sys + +# Add root to path +now_dir = os.getcwd() +sys.path.append(now_dir) + +def main(): + # List of voices as defined in Task 7/10 + voices = [ + 'male_low', 'male_mid', 'female_low', 'female_high', + 'anime_airy', 'accent_non_native', 'singing_male', 'singing_female', + 'child', 'elderly' + ] + + # Base paths + weights_dir = os.path.join(now_dir, "assets", "weights") + datasets_dir = os.path.join(now_dir, "datasets") + experiments_dir = os.path.join(now_dir, "experiments") + + # Path to test_grid.py + test_grid_script = os.path.join(now_dir, "tools", "test_grid.py") + + print(f"Starting Batch Experiments for {len(voices)} voices...") + + for voice in voices: + print(f"\n--- Processing Voice: {voice} ---") + + # Check for model + model_name = f"{voice}.pth" + model_path = os.path.join(weights_dir, model_name) + + if not os.path.exists(model_path): + print(f"Skipping {voice}: Model not found at {model_path}") + continue + + # Check for test audio + # We need a reference audio to run inference on. + # Ideally, we should have a 'test_samples' folder or use a file from the dataset itself (held out). + # For now, let's look for a 'test.wav' in the voice's dataset folder, or a global test file. + + # Strategy: Look for 'test.wav' in dataset dir, else take the first wav file found. + voice_dataset_dir = os.path.join(datasets_dir, voice) + input_audio = None + + if os.path.exists(voice_dataset_dir): + potential_files = [f for f in os.listdir(voice_dataset_dir) if f.endswith(".wav")] + if "test.wav" in potential_files: + input_audio = os.path.join(voice_dataset_dir, "test.wav") + elif len(potential_files) > 0: + input_audio = os.path.join(voice_dataset_dir, potential_files[0]) + + if not input_audio: + print(f"Skipping {voice}: No input audio found in {voice_dataset_dir}") + continue + + # Check for index file (optional but recommended) + # Usually located in logs/{voice}/added_*.index + # We need to find it. 
+ logs_dir = os.path.join(now_dir, "logs", voice) + index_path = "" + if os.path.exists(logs_dir): + for f in os.listdir(logs_dir): + if f.startswith("added_") and f.endswith(".index"): + index_path = os.path.join(logs_dir, f) + break + + print(f"Model: {model_name}") + print(f"Input: {input_audio}") + print(f"Index: {index_path if index_path else 'None'}") + + # Run test_grid.py + cmd = [ + sys.executable, test_grid_script, + "--model_name", model_name, + "--input_path", input_audio, + "--output_dir", experiments_dir + ] + + if index_path: + cmd.extend(["--index_path", index_path]) + + try: + subprocess.run(cmd, check=True) + print(f"Successfully ran experiments for {voice}") + except subprocess.CalledProcessError as e: + print(f"Error running experiments for {voice}: {e}") + + print("\nBatch Experiments Completed.") + +if __name__ == "__main__": + main() diff --git a/tools/test_grid.py b/tools/test_grid.py new file mode 100644 index 000000000..7f8728289 --- /dev/null +++ b/tools/test_grid.py @@ -0,0 +1,131 @@ +import itertools +import argparse +import os +import sys +import json +import time +from scipy.io import wavfile + +# Add root to path +now_dir = os.getcwd() +sys.path.append(now_dir) + +from dotenv import load_dotenv +from configs.config import Config +from infer.modules.vc.modules import VC + +def main(): + parser = argparse.ArgumentParser(description="Run RVC inference across a parameter grid.") + parser.add_argument("--model_name", required=True, help="Name of the model (must be in assets/weights)") + parser.add_argument("--input_path", required=True, help="Path to reference audio file") + parser.add_argument("--index_path", default="", help="Path to .index file") + parser.add_argument("--output_dir", default="experiments", help="Base directory for output") + parser.add_argument("--f0up_key", type=int, default=0, help="Pitch shift (semitones)") + + args = parser.parse_args() + + # Load config and VC + load_dotenv() + config = Config() + vc = VC(config) + vc.get_vc(args.model_name) + + # Define Grid + # You can modify this grid in the code or make it configurable via JSON later + grid = { + "f0method": ["rmvpe", "pm"], # "harvest", "crepe" are slower + "index_rate": [0.0, 0.5, 0.75, 1.0], + "filter_radius": [3], + "rms_mix_rate": [0.25, 1.0], + "protect": [0.33], + "resample_sr": [0], # 0 means no resampling + } + + # Prepare output directory + model_slug = os.path.splitext(args.model_name)[0] + audio_slug = os.path.splitext(os.path.basename(args.input_path))[0] + timestamp = int(time.time()) + experiment_dir = os.path.join(args.output_dir, model_slug, audio_slug, str(timestamp)) + os.makedirs(experiment_dir, exist_ok=True) + + print(f"Starting Grid Search Experiment") + print(f"Model: {args.model_name}") + print(f"Input: {args.input_path}") + print(f"Output: {experiment_dir}") + + # Generate combinations + keys = grid.keys() + values = grid.values() + combinations = list(itertools.product(*values)) + + results = [] + + total = len(combinations) + print(f"Total combinations to run: {total}") + + for i, combo in enumerate(combinations): + params = dict(zip(keys, combo)) + print(f"[{i+1}/{total}] Running with {params}") + + # Construct output filename + # e.g. 
rmvpe_idx0.5_rms1.0.wav + filename_parts = [f"{k}{v}" for k, v in params.items()] + filename = "_".join(filename_parts) + ".wav" + output_path = os.path.join(experiment_dir, filename) + + # Run Inference + try: + info, opt = vc.vc_single( + 0, # sid + args.input_path, + args.f0up_key, + None, # f0_file + params["f0method"], + args.index_path, + None, # file_index2 + params["index_rate"], + params["filter_radius"], + params["resample_sr"], + params["rms_mix_rate"], + params["protect"] + ) + + if "Success" in info: + tgt_sr, audio_opt = opt + wavfile.write(output_path, tgt_sr, audio_opt) + results.append({ + "params": params, + "output_file": filename, + "status": "success" + }) + else: + print(f"Error: {info}") + results.append({ + "params": params, + "status": "failed", + "error": info + }) + + except Exception as e: + print(f"Exception: {e}") + results.append({ + "params": params, + "status": "error", + "error": str(e) + }) + + # Save metadata + metadata_path = os.path.join(experiment_dir, "metadata.json") + with open(metadata_path, "w") as f: + json.dump({ + "model": args.model_name, + "input_path": args.input_path, + "f0up_key": args.f0up_key, + "grid": grid, + "results": results + }, f, indent=2) + + print(f"Experiment completed. Results saved to {experiment_dir}") + +if __name__ == "__main__": + main() diff --git a/tools/train_batch.py b/tools/train_batch.py new file mode 100644 index 000000000..90604efb2 --- /dev/null +++ b/tools/train_batch.py @@ -0,0 +1,194 @@ +import os +import sys +import time +import json +import argparse +import subprocess +from pathlib import Path + +# Add project root to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from configs.config import Config + +def run_command(cmd, cwd=None): + print(f"Running: {cmd}") + process = subprocess.Popen( + cmd, + shell=True, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True + ) + + # Stream output + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + print(output.strip()) + + rc = process.poll() + return rc + +def train_voice_model(voice_name, dataset_path, epochs=50, batch_size=8, sample_rate="40k", version="v2", gpu_id="0"): + """ + Automates the RVC training pipeline for a single voice model. + """ + print(f"\n{'='*50}") + print(f"Starting training for: {voice_name}") + print(f"{'='*50}\n") + + root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + logs_dir = os.path.join(root_dir, "logs", voice_name) + + # 1. Preprocessing + print("\n[Step 1/4] Preprocessing Data...") + cmd_preprocess = f"python infer/modules/train/preprocess.py \"{dataset_path}\" {sample_rate.replace('k','000')} 2 \"{logs_dir}\" False 3.0" + if run_command(cmd_preprocess, cwd=root_dir) != 0: + print("Error in preprocessing") + return False + + # 2. Feature Extraction + print("\n[Step 2/4] Extracting Features...") + # F0 extraction (rmvpe_gpu) + cmd_f0 = f"python infer/modules/train/extract/extract_f0_rmvpe.py 1 0 0 \"{logs_dir}\" True" + if run_command(cmd_f0, cwd=root_dir) != 0: + print("Error in F0 extraction") + return False + + # Feature extraction (HuBERT) + cmd_feat = f"python infer/modules/train/extract_feature_print.py {gpu_id} 1 0 0 \"{logs_dir}\" {version} False" + if run_command(cmd_feat, cwd=root_dir) != 0: + print("Error in feature extraction") + return False + + # 3. 
Training Model + print("\n[Step 3/4] Training Model...") + # Determine pretrained models + if version == "v1": + pg = f"assets/pretrained/f0G{sample_rate}.pth" + pd = f"assets/pretrained/f0D{sample_rate}.pth" + else: + pg = f"assets/pretrained_v2/f0G{sample_rate}.pth" + pd = f"assets/pretrained_v2/f0D{sample_rate}.pth" + + cmd_train = ( + f"python infer/modules/train/train.py -e \"{voice_name}\" -sr {sample_rate} -f0 1 -bs {batch_size} " + f"-g {gpu_id} -te {epochs} -se 10 -pg \"{pg}\" -pd \"{pd}\" -l 0 -c 0 -sw 1 -v {version}" + ) + + if run_command(cmd_train, cwd=root_dir) != 0: + print("Error in training") + return False + + # 4. Training Index + print("\n[Step 4/4] Training Index...") + + index_script = f""" +import sys +import os +import numpy as np +import faiss +from sklearn.cluster import MiniBatchKMeans + +exp_dir = "{logs_dir}" +version = "{version}" +feature_dir = os.path.join(exp_dir, "3_feature256" if version == "v1" else "3_feature768") + +if not os.path.exists(feature_dir): + print("Feature dir not found") + sys.exit(1) + +listdir_res = list(os.listdir(feature_dir)) +if len(listdir_res) == 0: + print("No features found") + sys.exit(1) + +npys = [] +for name in sorted(listdir_res): + phone = np.load(os.path.join(feature_dir, name)) + npys.append(phone) + +big_npy = np.concatenate(npys, 0) +big_npy_idx = np.arange(big_npy.shape[0]) +np.random.shuffle(big_npy_idx) +big_npy = big_npy[big_npy_idx] + +if big_npy.shape[0] > 2e5: + big_npy = ( + MiniBatchKMeans( + n_clusters=10000, + batch_size=256 * 8, + compute_labels=False, + init="random", + ) + .fit(big_npy) + .cluster_centers_ + ) + +np.save(os.path.join(exp_dir, "total_fea.npy"), big_npy) +n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) +index = faiss.index_factory(256 if version == "v1" else 768, "IVF%s,Flat" % n_ivf) +index_ivf = faiss.extract_index_ivf(index) +index_ivf.nprobe = 1 +index.train(big_npy) +faiss.write_index( + index, + os.path.join(exp_dir, f"trained_IVF{{n_ivf}}_Flat_nprobe_1_{voice_name}_{version}.index") +) + +batch_size_add = 8192 +for i in range(0, big_npy.shape[0], batch_size_add): + index.add(big_npy[i : i + batch_size_add]) + +faiss.write_index( + index, + os.path.join(exp_dir, f"added_IVF{{n_ivf}}_Flat_nprobe_1_{voice_name}_{version}.index") +) +print("Index training complete") +""" + + # Write temp script + with open("temp_index_train.py", "w") as f: + f.write(index_script) + + if run_command("python temp_index_train.py", cwd=root_dir) != 0: + print("Error in index training") + os.remove("temp_index_train.py") + return False + + os.remove("temp_index_train.py") + print(f"\nSuccessfully trained model for {voice_name}!") + return True + +def main(): + parser = argparse.ArgumentParser(description="Batch Train RVC Models") + parser.add_argument("--voice", type=str, help="Specific voice name to train (folder name in datasets/)") + parser.add_argument("--epochs", type=int, default=50, help="Number of epochs") + parser.add_argument("--batch_size", type=int, default=8, help="Batch size") + + args = parser.parse_args() + + datasets_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "datasets") + + if args.voice: + voices = [args.voice] + else: + voices = [d for d in os.listdir(datasets_dir) if os.path.isdir(os.path.join(datasets_dir, d))] + + print(f"Found {len(voices)} voices to train: {voices}") + + for voice in voices: + dataset_path = os.path.join(datasets_dir, voice) + # Check if dataset has files + if not os.path.exists(dataset_path) or not 
os.listdir(dataset_path): + print(f"Skipping {voice} - no data found") + continue + + train_voice_model(voice, dataset_path, epochs=args.epochs, batch_size=args.batch_size) + +if __name__ == "__main__": + main()
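
Example end-to-end workflow (a minimal sketch, assuming the `rvc` conda environment from setup-doc.md is active and the pretrained models have been downloaded; the voice name and raw-audio path below are illustrative):

```bash
# 1. Pull sample speech into datasets/<voice_type>/ (LibriTTS, English dialects, Genshin, Hi-Fi TTS)
python tools/download_datasets.py

# 2. Normalize, strip silence, and chunk any additional raw recording into a dataset folder
python tools/audio_preprocessor.py -i /path/to/raw_recording.mp3 -o datasets/male_low

# 3. Preprocess, extract F0/HuBERT features, train, and build the FAISS index for one voice
python tools/train_batch.py --voice male_low --epochs 50 --batch_size 8

# 4. Sweep inference parameters for every trained voice; outputs land under experiments/
python tools/run_experiments_batch.py
```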
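
The grid runner can also be invoked directly for a single model and reference clip (a sketch; the model, input, and index paths are placeholders for files produced by the training step):

```bash
python tools/test_grid.py \
  --model_name male_low.pth \
  --input_path datasets/male_low/test.wav \
  --index_path logs/male_low/added_IVF256_Flat_nprobe_1_male_low_v2.index \
  --output_dir experiments
```

Each run writes one WAV per parameter combination plus a metadata.json recording the grid and per-run status.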