diff --git a/rvc/lib/tools/model_download.py b/rvc/lib/tools/model_download.py
index e5c5126..ab1b136 100644
--- a/rvc/lib/tools/model_download.py
+++ b/rvc/lib/tools/model_download.py
@@ -380,6 +380,6 @@ def model_download_pipeline(url: str):
             return result
         else:
             return "Error"
-    except Exception as e:
-        print(f"An unexpected error occurred: {e}")
+    except Exception as error:
+        print(f"An unexpected error occurred: {error}")
         return "Error"
diff --git a/rvc/train/extract/extract_f0_print.py b/rvc/train/extract/extract_f0_print.py
index 285797a..b647a5b 100644
--- a/rvc/train/extract/extract_f0_print.py
+++ b/rvc/train/extract/extract_f0_print.py
@@ -19,12 +19,15 @@
 f0_method = str(sys.argv[2])
 hop_length = int(sys.argv[3])
 num_processes = int(sys.argv[4])
+gpus = str(sys.argv[5])  # "-" = use CPU
+
+os.environ["CUDA_VISIBLE_DEVICES"] = gpus.replace("-", ",")
 
 
 class FeatureInput:
     """Class for F0 extraction."""
 
-    def __init__(self, sample_rate=16000, hop_size=160):
+    def __init__(self, sample_rate=16000, hop_size=160, device="cpu"):
         self.fs = sample_rate
         self.hop = hop_size
         self.f0_bin = 256
@@ -32,10 +35,11 @@ def __init__(self, sample_rate=16000, hop_size=160):
         self.f0_min = 50.0
         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+        self.device = device
         self.model_rmvpe = RMVPE0Predictor(
             os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
             is_half=False,
-            device="cpu",
+            device=device,
         )
 
     def compute_f0(self, np_arr, f0_method, hop_length):
@@ -53,7 +57,7 @@ def compute_f0(self, np_arr, f0_method, hop_length):
 
     def get_crepe(self, x, p_len, hop_length):
         """Extract F0 using CREPE."""
-        audio = torch.from_numpy(x.astype(np.float32)).to("cpu")
+        audio = torch.from_numpy(x.astype(np.float32)).to(self.device)
         audio /= torch.quantile(torch.abs(audio), 0.999)
         audio = torch.unsqueeze(audio, dim=0)
 
@@ -65,7 +69,7 @@ def get_crepe(self, x, p_len, hop_length):
             self.f0_max,
             "full",
             batch_size=hop_length * 2,
-            device="cpu",
+            device=self.device,
             pad=True,
         )
 
@@ -108,9 +112,14 @@ def process_file(self, file_info, f0_method, hop_length):
         except Exception as error:
             print(f"An error occurred extracting file {inp_path}: {error}")
 
+    def process_files(self, files, f0_method, hop_length, pbar):
+        """Process multiple files."""
+        for file_info in files:
+            self.process_file(file_info, f0_method, hop_length)
+            pbar.update()
+
 
-def main(exp_dir, f0_method, hop_length, num_processes):
-    feature_input = FeatureInput()
+def main(exp_dir, f0_method, hop_length, num_processes, gpus):
     paths = []
     input_root = os.path.join(exp_dir, "sliced_audios_16k")
     output_root1 = os.path.join(exp_dir, "f0")
@@ -125,27 +134,60 @@ def main(exp_dir, f0_method, hop_length, num_processes, gpus):
         input_path = os.path.join(input_root, name)
         output_path1 = os.path.join(output_root1, name)
         output_path2 = os.path.join(output_root2, name)
-        np_arr = load_audio(input_path, 16000)  # self.fs?
+        np_arr = load_audio(input_path, 16000)
         paths.append([input_path, output_path1, output_path2, np_arr])
 
     print(f"Starting extraction with {num_processes} cores and {f0_method}...")
 
     start_time = time.time()
 
-    # Use multiprocessing Pool for parallel processing with progress bar
-    with tqdm.tqdm(total=len(paths), desc="F0 Extraction") as pbar:
-        pool = Pool(processes=num_processes)
-        process_file_partial = partial(
-            feature_input.process_file, f0_method=f0_method, hop_length=hop_length
-        )
-        for _ in pool.imap_unordered(process_file_partial, paths):
-            pbar.update()
-        pool.close()
-        pool.join()
+    if gpus != "-":
+        gpus = gpus.split("-")
+        num_gpus = len(gpus)
+        process_partials = []
+        pbar = tqdm.tqdm(total=len(paths), desc="F0 Extraction")
+
+        for idx, gpu in enumerate(gpus):
+            device = f"cuda:{gpu}"
+            if torch.cuda.is_available() and torch.cuda.device_count() > idx:
+                try:
+                    feature_input = FeatureInput(device=device)
+                    part_paths = paths[idx::num_gpus]
+                    process_partials.append((feature_input, part_paths))
+                except Exception as error:
+                    print(
+                        f"Oops, there was an issue initializing GPU {device} ({error}). Maybe you don't have a GPU? No worries, switching to CPU for now."
+                    )
+                    feature_input = FeatureInput(device="cpu")
+                    part_paths = paths[idx::num_gpus]
+                    process_partials.append((feature_input, part_paths))
+            else:
+                print(f"GPU {device} is not available. Switching to CPU.")
+                feature_input = FeatureInput(device="cpu")
+                part_paths = paths[idx::num_gpus]
+                process_partials.append((feature_input, part_paths))
+
+        # Process each part with the corresponding GPU or CPU
+        for feature_input, part_paths in process_partials:
+            feature_input.process_files(part_paths, f0_method, hop_length, pbar)
+        pbar.close()
+
+    else:
+        # Use multiprocessing Pool for parallel processing with progress bar
+        feature_input = FeatureInput(device="cpu")
+        with tqdm.tqdm(total=len(paths), desc="F0 Extraction") as pbar:
+            pool = Pool(processes=num_processes)
+            process_file_partial = partial(
+                feature_input.process_file, f0_method=f0_method, hop_length=hop_length
+            )
+            for _ in pool.imap_unordered(process_file_partial, paths):
+                pbar.update()
+            pool.close()
+            pool.join()
 
     elapsed_time = time.time() - start_time
     print(f"F0 extraction completed in {elapsed_time:.2f} seconds.")
 
 
 if __name__ == "__main__":
-    main(exp_dir, f0_method, hop_length, num_processes)
+    main(exp_dir, f0_method, hop_length, num_processes, gpus)
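Note: the `paths[idx::num_gpus]` slices in `main()` above shard the file list round-robin across the requested GPUs, so the shards are disjoint and together cover every file. A minimal sketch of that slicing, with hypothetical file names:

    paths = ["a.wav", "b.wav", "c.wav", "d.wav", "e.wav"]
    gpus = "0-1".split("-")  # the "-"-separated GPU list from sys.argv[5]
    num_gpus = len(gpus)

    # Each device gets every num_gpus-th file, starting at its own offset.
    shards = {f"cuda:{gpu}": paths[idx::num_gpus] for idx, gpu in enumerate(gpus)}
    print(shards)
    # {'cuda:0': ['a.wav', 'c.wav', 'e.wav'], 'cuda:1': ['b.wav', 'd.wav']}
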
diff --git a/rvc/train/extract/extract_feature_print.py b/rvc/train/extract/extract_feature_print.py
index 8f9262f..13fd751 100644
--- a/rvc/train/extract/extract_feature_print.py
+++ b/rvc/train/extract/extract_feature_print.py
@@ -9,98 +9,160 @@
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
-from rvc.lib.utils import load_embedding
 
-# Parse command line arguments
-device = str(sys.argv[1])
-n_parts = int(sys.argv[2])
-i_part = int(sys.argv[3])
-i_gpu = int(sys.argv[4])
-exp_dir = str(sys.argv[5])
-version = str(sys.argv[6])
-is_half = bool(sys.argv[7])
-embedder_model = str(sys.argv[8])
+from rvc.lib.utils import load_embedding
+from rvc.configs.config import Config
 
-try:
-    embedder_model_custom = str(sys.argv[9])
-except:
-    embedder_model_custom = None
+config = Config()
 
-os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
-wav_path = f"{exp_dir}/sliced_audios_16k"
-out_path = f"{exp_dir}/v1_extracted" if version == "v1" else f"{exp_dir}/v2_extracted"
-os.makedirs(out_path, exist_ok=True)
 
+def setup_paths(exp_dir: str, version: str):
+    """Set up input and output paths."""
+    wav_path = os.path.join(exp_dir, "sliced_audios_16k")
+    out_path = os.path.join(
+        exp_dir, "v1_extracted" if version == "v1" else "v2_extracted"
+    )
+    os.makedirs(out_path, exist_ok=True)
+    return wav_path, out_path
 
-def read_wave(wav_path, normalize=False):
+
+def read_wave(wav_path: str, normalize: bool = False):
+    """Read a wave file and return its features."""
     wav, sr = sf.read(wav_path)
-    assert sr == 16000
+    assert sr == 16000, "Sample rate must be 16000"
+
     feats = torch.from_numpy(wav)
-    feats = feats.half() if is_half else feats.float()
+    feats = feats.half() if config.is_half else feats.float()
     feats = feats.mean(-1) if feats.dim() == 2 else feats
     feats = feats.view(1, -1)
+
     if normalize:
         with torch.no_grad():
             feats = F.layer_norm(feats, feats.shape)
     return feats
 
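Note: `read_wave()` above now takes the half-precision switch from the shared `Config` object instead of a CLI flag. A minimal sketch of the tensor shaping it performs, using random samples in place of `sf.read()` output (the `is_half` guard here is an assumption for illustration; the real value comes from `config.is_half`):

    import torch

    wav = torch.randn(16000)  # stand-in for one second of 16 kHz mono audio
    is_half = torch.cuda.is_available()

    feats = wav.half() if is_half else wav.float()
    feats = feats.mean(-1) if feats.dim() == 2 else feats  # downmix stereo
    feats = feats.view(1, -1)  # (1, samples), the layout the embedder expects
    print(feats.dtype, feats.shape)  # e.g. torch.float32 torch.Size([1, 16000]) on CPU
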
"sliced_audios_16k") + out_path = os.path.join( + exp_dir, "v1_extracted" if version == "v1" else "v2_extracted" + ) + os.makedirs(out_path, exist_ok=True) + return wav_path, out_path -def read_wave(wav_path, normalize=False): +def read_wave(wav_path: str, normalize: bool = False): + """Read a wave file and return its features.""" wav, sr = sf.read(wav_path) - assert sr == 16000 + assert sr == 16000, "Sample rate must be 16000" + feats = torch.from_numpy(wav) - feats = feats.half() if is_half else feats.float() + feats = feats.half() if config.is_half else feats.float() feats = feats.mean(-1) if feats.dim() == 2 else feats feats = feats.view(1, -1) + if normalize: with torch.no_grad(): feats = F.layer_norm(feats, feats.shape) return feats -print("Starting feature extraction...") - -start_time = time.time() - - -models, saved_cfg, task = load_embedding(embedder_model, embedder_model_custom) -model = models[0] -model = model.to(device) -if device not in ["mps", "cpu"]: - model = model.half() -model.eval() - -todo = sorted(os.listdir(wav_path))[i_part::n_parts] -n = max(1, len(todo) // 10) - -if len(todo) == 0: - print( - "An error occurred in the feature extraction, make sure you have provided the audios correctly." - ) -else: - # print(f"{len(todo)}") - with tqdm.tqdm(total=len(todo)) as pbar: - for idx, file in enumerate(todo): - try: - if file.endswith(".wav"): - wav_file_path = os.path.join(wav_path, file) - out_file_path = os.path.join(out_path, file.replace("wav", "npy")) - - if os.path.exists(out_file_path): - continue - - feats = read_wave(wav_file_path, normalize=saved_cfg.task.normalize) - padding_mask = torch.BoolTensor(feats.shape).fill_(False) - inputs = { - "source": feats.to(device), - "padding_mask": padding_mask.to(device), - "output_layer": 9 if version == "v1" else 12, - } - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = ( - model.final_proj(logits[0]) - if version == "v1" - else logits[0] - ) - - feats = feats.squeeze(0).float().cpu().numpy() - if np.isnan(feats).sum() == 0: - np.save(out_file_path, feats, allow_pickle=False) - else: - print(f"{file} is invalid") - pbar.set_description(f"Processing {file} {feats.shape}") - except Exception as error: - print(f"An error occurred processing {file}: {error}") - pbar.update(1) - +def process_file( + file: str, + wav_path: str, + out_path: str, + model: torch.nn.Module, + device: str, + version: str, + saved_cfg: Config, +): + """Process a single audio file.""" + wav_file_path = os.path.join(wav_path, file) + out_file_path = os.path.join(out_path, file.replace("wav", "npy")) + + if os.path.exists(out_file_path): + return + + # Load and prepare features + feats = read_wave(wav_file_path, normalize=saved_cfg.task.normalize) + + # Adjust dtype based on the device + dtype = torch.float16 if device.startswith("cuda") else torch.float32 + feats = feats.to(dtype).to(device) + + padding_mask = torch.BoolTensor(feats.shape).fill_(False).to(dtype).to(device) + + inputs = { + "source": feats, + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + + with torch.no_grad(): + model = model.to(device).to(dtype) + + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + + feats = feats.squeeze(0).float().cpu().numpy() + if not np.isnan(feats).any(): + np.save(out_file_path, feats, allow_pickle=False) + else: + print(f"{file} contains NaN values and will be skipped.") + + +def main(): + """Main function to orchestrate the 
diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py
index 10b8a4a..4803b60 100644
--- a/rvc/train/preprocess/preprocess.py
+++ b/rvc/train/preprocess/preprocess.py
@@ -1,13 +1,11 @@
-from multiprocessing import cpu_count
 import os
 import sys
 import time
-from typing import List, Tuple
-import multiprocessing
-from scipy import signal
-from scipy.io import wavfile
 import librosa
 import numpy as np
+from scipy import signal
+from scipy.io import wavfile
+from multiprocessing import cpu_count, Pool
 
 now_directory = os.getcwd()
 sys.path.append(now_directory)
@@ -28,15 +26,11 @@
 MAX_AMPLITUDE = 0.9
 ALPHA = 0.75
 HIGH_PASS_CUTOFF = 48
+SAMPLE_RATE_16K = 16000
 
 # Define directory paths
-GT_WAVS_DIR = f"{experiment_directory}/sliced_audios"
-WAVS16K_DIR = f"{experiment_directory}/sliced_audios_16k"
-
-# Create directories if they don't exist
-os.makedirs(experiment_directory, exist_ok=True)
-os.makedirs(GT_WAVS_DIR, exist_ok=True)
-os.makedirs(WAVS16K_DIR, exist_ok=True)
+GT_WAVS_DIR = os.path.join(experiment_directory, "sliced_audios")
+WAVS16K_DIR = os.path.join(experiment_directory, "sliced_audios_16k")
 
 
 class PreProcess:
@@ -56,27 +50,37 @@ def __init__(self, sr: int, exp_dir: str, per: float):
         self.per = per
         self.exp_dir = exp_dir
 
-    def normalize_and_write(self, tmp_audio: np.ndarray, idx0: int, idx1: int):
-        tmp_max = np.abs(tmp_audio).max()
+    def _normalize_audio(self, audio: np.ndarray):
+        """Normalizes the audio to the desired amplitude."""
+        tmp_max = np.abs(audio).max()
         if tmp_max > 2.5:
-            print(f"{idx0}-{idx1}-{tmp_max}-filtered")
+            return None  # Indicate audio should be filtered out
+        return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio
+
+    def _write_audio(self, audio: np.ndarray, filename: str, sr: int):
+        """Writes the audio to a WAV file."""
+        wavfile.write(filename, sr, audio.astype(np.float32))
+
+    def process_audio_segment(self, audio_segment: np.ndarray, idx0: int, idx1: int):
+        """Processes a single audio segment."""
+        normalized_audio = self._normalize_audio(audio_segment)
+        if normalized_audio is None:
+            print(f"{idx0}-{idx1}-filtered")
             return
-        tmp_audio = (tmp_audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (
-            1 - ALPHA
-        ) * tmp_audio
-        wavfile.write(
-            f"{GT_WAVS_DIR}/{idx0}_{idx1}.wav",
-            self.sr,
-            tmp_audio.astype(np.float32),
-        )
-        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
-        wavfile.write(
-            f"{WAVS16K_DIR}/{idx0}_{idx1}.wav",
-            16000,
-            tmp_audio.astype(np.float32),
+
+        # Write original sample rate audio
+        gt_wav_path = os.path.join(GT_WAVS_DIR, f"{idx0}_{idx1}.wav")
+        self._write_audio(normalized_audio, gt_wav_path, self.sr)
+
+        # Resample and write 16kHz audio
+        audio_16k = librosa.resample(
+            normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K
         )
+        wav_16k_path = os.path.join(WAVS16K_DIR, f"{idx0}_{idx1}.wav")
+        self._write_audio(audio_16k, wav_16k_path, SAMPLE_RATE_16K)
 
     def process_audio(self, path: str, idx0: int):
+        """Processes a single audio file."""
         try:
             audio = load_audio(path, self.sr)
             audio = signal.lfilter(self.b_high, self.a_high, audio)
@@ -91,35 +95,37 @@ def process_audio(self, path: str, idx0: int):
                         tmp_audio = audio_segment[
                             start : start + int(self.per * self.sr)
                         ]
-                        self.process_audio_segment(tmp_audio, idx0, idx1)
+                        self.process_audio_segment(tmp_audio, idx0, idx1)
                         idx1 += 1
                     else:
                         tmp_audio = audio_segment[start:]
+                        self.process_audio_segment(tmp_audio, idx0, idx1)
                         idx1 += 1
                         break
-                self.normalize_and_write(tmp_audio, idx0, idx1)
         except Exception as error:
             print(f"An error occurred on {path} path: {error}")
 
-    def process_audio_multiprocessing(self, infos: List[Tuple[str, int]]):
-        for path, idx0 in infos:
-            self.process_audio(path, idx0)
+    def process_audio_file(self, file_path_idx):
+        file_path, idx0 = file_path_idx
+        self.process_audio(file_path, idx0)
 
     def process_audio_multiprocessing_input_directory(
         self, input_root: str, num_processes: int
     ):
-        try:
-            infos = [
-                (f"{input_root}/{name}", idx)
-                for idx, name in enumerate(sorted(list(os.listdir(input_root))))
-            ]
-            with multiprocessing.Pool(processes=num_processes) as pool:
-                pool.map(
-                    self.process_audio_multiprocessing,
-                    [infos[i::num_processes] for i in range(num_processes)],
-                )
-        except Exception as error:
-            print(f"An error occurred on {input_root} path: {error}")
+        # Get list of files
+        files = [
+            (os.path.join(input_root, f), idx)
+            for idx, f in enumerate(os.listdir(input_root))
+            if f.endswith(".wav")
+        ]
+
+        # Create the directories if they don't exist
+        os.makedirs(GT_WAVS_DIR, exist_ok=True)
+        os.makedirs(WAVS16K_DIR, exist_ok=True)
+
+        # Use multiprocessing to process files
+        with Pool(processes=num_processes) as pool:
+            pool.map(self.process_audio_file, files)
 
 
 def preprocess_training_set(
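Note: `_normalize_audio()` above blends the peak-normalized signal (weight `ALPHA`, scaled to `MAX_AMPLITUDE`) with the raw signal (weight `1 - ALPHA`), and drops any segment whose peak exceeds 2.5 instead of rescaling it. A worked sketch with made-up samples:

    import numpy as np

    MAX_AMPLITUDE = 0.9
    ALPHA = 0.75

    audio = np.array([0.1, -0.5, 0.25], dtype=np.float32)
    tmp_max = np.abs(audio).max()  # 0.5, below the 2.5 filter threshold

    normalized = (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio
    print(normalized)  # approximately [ 0.16 -0.8   0.4 ]
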
diff --git a/rvc/train/train.py b/rvc/train/train.py
index 07f3f39..52d8618 100644
--- a/rvc/train/train.py
+++ b/rvc/train/train.py
@@ -123,7 +123,8 @@ def main():
     """
     Main function to start the training process.
     """
-
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
     def start():
         """
         Starts the training process with multi-GPU support.
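Note: the patch moves the `MASTER_ADDR`/`MASTER_PORT` assignment from `run()` into `main()` so the rendezvous address is set once, before workers are spawned, and every rank then reads the same values through `init_method="env://"`. A minimal single-process sketch of that rendezvous (the `world_size=1, rank=0` values are illustrative):

    import os
    from random import randint

    import torch.distributed as dist

    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(randint(20000, 55555))

    dist.init_process_group(backend="gloo", init_method="env://", world_size=1, rank=0)
    print(dist.get_rank(), dist.get_world_size())  # 0 1
    dist.destroy_process_group()
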
@@ -267,8 +268,6 @@ def run(
     writer = SummaryWriter(log_dir=experiment_dir)
     writer_eval = SummaryWriter(log_dir=os.path.join(experiment_dir, "eval"))
 
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
     dist.init_process_group(
         backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
     )
diff --git a/rvc_cli.py b/rvc_cli.py
index b9067f4..5c184ff 100644
--- a/rvc_cli.py
+++ b/rvc_cli.py
@@ -199,12 +199,17 @@ def run_tts_script(
         os.remove(output_tts_path)
 
     command_tts = [
-        python,
-        tts_script_path,
-        tts_text,
-        tts_voice,
-        tts_rate,
-        output_tts_path,
+        *map(
+            str,
+            [
+                python,
+                tts_script_path,
+                tts_text,
+                tts_voice,
+                tts_rate,
+                output_tts_path,
+            ],
+        ),
     ]
     subprocess.run(command_tts)
     infer_pipeline = import_voice_converter()
@@ -270,6 +275,7 @@ def run_extract_script(
     pitch_guidance: bool,
     hop_length: int,
     cpu_cores: int,
+    gpu: str,
     sample_rate: int,
     embedder_model: str,
     embedder_model_custom: str = None,
@@ -293,6 +299,7 @@ def run_extract_script(
                 f0_method,
                 hop_length,
                 cpu_cores,
+                gpu,
             ],
         ),
     ]
@@ -303,13 +310,9 @@ def run_extract_script(
         *map(
             str,
             [
-                config.device,
-                1,
-                0,
-                0,
                 model_path,
                 rvc_version,
-                config.is_half,
+                gpu,
                 embedder_model,
                 embedder_model_custom,
             ],
@@ -1035,6 +1038,12 @@ def parse_arguments():
         choices=range(1, 65),
         default=None,
     )
+    extract_parser.add_argument(
+        "--gpu",
+        type=str,
+        help="GPU device to use for feature extraction (optional).",
+        default="-",
+    )
     extract_parser.add_argument(
         "--sample_rate",
         type=int,
@@ -1441,6 +1450,7 @@ def main():
             pitch_guidance=args.pitch_guidance,
             hop_length=args.hop_length,
             cpu_cores=args.cpu_cores,
+            gpu=args.gpu,
             sample_rate=args.sample_rate,
             embedder_model=args.embedder_model,
             embedder_model_custom=args.embedder_model_custom,
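Note: argparse applies `type` only to values parsed from the command line, so the `"-"` default (meaning CPU) passes through untouched, and `run_extract_script()` str()-maps whatever arrives into the extractor command lines. A small sketch of how the new flag resolves:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", type=str, default="-")

    print(parser.parse_args([]).gpu)              # '-'  -> CPU path in the extractors
    print(parser.parse_args(["--gpu", "0"]).gpu)  # '0'  -> CUDA_VISIBLE_DEVICES="0"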