diff --git a/rvc/lib/tools/model_download.py b/rvc/lib/tools/model_download.py
index e5c5126..ab1b136 100644
--- a/rvc/lib/tools/model_download.py
+++ b/rvc/lib/tools/model_download.py
@@ -380,6 +380,6 @@ def model_download_pipeline(url: str):
             return result
         else:
             return "Error"
-    except Exception as e:
-        print(f"An unexpected error occurred: {e}")
+    except Exception as error:
+        print(f"An unexpected error occurred: {error}")
         return "Error"
diff --git a/rvc/train/extract/extract_f0_print.py b/rvc/train/extract/extract_f0_print.py
index 285797a..b647a5b 100644
--- a/rvc/train/extract/extract_f0_print.py
+++ b/rvc/train/extract/extract_f0_print.py
@@ -19,12 +19,15 @@
 f0_method = str(sys.argv[2])
 hop_length = int(sys.argv[3])
 num_processes = int(sys.argv[4])
+gpus = str(sys.argv[5])  # "-" = use CPU
+
+os.environ["CUDA_VISIBLE_DEVICES"] = gpus.replace("-", ",")
 
 
 class FeatureInput:
     """Class for F0 extraction."""
 
-    def __init__(self, sample_rate=16000, hop_size=160):
+    def __init__(self, sample_rate=16000, hop_size=160, device="cpu"):
         self.fs = sample_rate
         self.hop = hop_size
         self.f0_bin = 256
@@ -32,10 +35,11 @@ def __init__(self, sample_rate=16000, hop_size=160):
         self.f0_min = 50.0
         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+        self.device = device
         self.model_rmvpe = RMVPE0Predictor(
             os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
             is_half=False,
-            device="cpu",
+            device=device,
         )
 
     def compute_f0(self, np_arr, f0_method, hop_length):
@@ -53,7 +57,7 @@ def compute_f0(self, np_arr, f0_method, hop_length):
 
     def get_crepe(self, x, p_len, hop_length):
         """Extract F0 using CREPE."""
-        audio = torch.from_numpy(x.astype(np.float32)).to("cpu")
+        audio = torch.from_numpy(x.astype(np.float32)).to(self.device)
         audio /= torch.quantile(torch.abs(audio), 0.999)
         audio = torch.unsqueeze(audio, dim=0)
 
@@ -65,7 +69,7 @@ def get_crepe(self, x, p_len, hop_length):
             self.f0_max,
             "full",
             batch_size=hop_length * 2,
-            device="cpu",
+            device=self.device,
             pad=True,
         )
 
@@ -108,9 +112,14 @@ def process_file(self, file_info, f0_method, hop_length):
         except Exception as error:
             print(f"An error occurred extracting file {inp_path}: {error}")
 
+    def process_files(self, files, f0_method, hop_length, pbar):
+        """Process multiple files."""
+        for file_info in files:
+            self.process_file(file_info, f0_method, hop_length)
+            pbar.update()
+
 
-def main(exp_dir, f0_method, hop_length, num_processes):
-    feature_input = FeatureInput()
+def main(exp_dir, f0_method, hop_length, num_processes, gpus):
     paths = []
     input_root = os.path.join(exp_dir, "sliced_audios_16k")
     output_root1 = os.path.join(exp_dir, "f0")
@@ -125,27 +134,60 @@ def main(exp_dir, f0_method, hop_length, num_processes, gpus):
         input_path = os.path.join(input_root, name)
         output_path1 = os.path.join(output_root1, name)
         output_path2 = os.path.join(output_root2, name)
-        np_arr = load_audio(input_path, 16000)  # self.fs?
+        np_arr = load_audio(input_path, 16000)
         paths.append([input_path, output_path1, output_path2, np_arr])
 
     print(f"Starting extraction with {num_processes} cores and {f0_method}...")
 
     start_time = time.time()
 
-    # Use multiprocessing Pool for parallel processing with progress bar
-    with tqdm.tqdm(total=len(paths), desc="F0 Extraction") as pbar:
-        pool = Pool(processes=num_processes)
-        process_file_partial = partial(
-            feature_input.process_file, f0_method=f0_method, hop_length=hop_length
-        )
-        for _ in pool.imap_unordered(process_file_partial, paths):
-            pbar.update()
-        pool.close()
-        pool.join()
+    if gpus != "-":
+        gpus = gpus.split("-")
+        num_gpus = len(gpus)
+        process_partials = []
+        pbar = tqdm.tqdm(total=len(paths), desc="F0 Extraction")
+
+        for idx, gpu in enumerate(gpus):
+            device = f"cuda:{gpu}"
+            if torch.cuda.is_available() and torch.cuda.device_count() > idx:
+                try:
+                    feature_input = FeatureInput(device=device)
+                    part_paths = paths[idx::num_gpus]
+                    process_partials.append((feature_input, part_paths))
+                except Exception as error:
+                    print(
+                        f"Oops, there was an issue initializing GPU {device} ({error}). Maybe you don't have a GPU? No worries, switching to CPU for now."
+                    )
+                    feature_input = FeatureInput(device="cpu")
+                    part_paths = paths[idx::num_gpus]
+                    process_partials.append((feature_input, part_paths))
+            else:
+                print(f"GPU {device} is not available. Switching to CPU.")
+                feature_input = FeatureInput(device="cpu")
+                part_paths = paths[idx::num_gpus]
+                process_partials.append((feature_input, part_paths))
+
+        # Process each part with the corresponding GPU or CPU
+        for feature_input, part_paths in process_partials:
+            feature_input.process_files(part_paths, f0_method, hop_length, pbar)
+        pbar.close()
+
+    else:
+        # Use multiprocessing Pool for parallel processing with progress bar
+        feature_input = FeatureInput(device="cpu")
+        with tqdm.tqdm(total=len(paths), desc="F0 Extraction") as pbar:
+            pool = Pool(processes=num_processes)
+            process_file_partial = partial(
+                feature_input.process_file, f0_method=f0_method, hop_length=hop_length
+            )
+            for _ in pool.imap_unordered(process_file_partial, paths):
+                pbar.update()
+            pool.close()
+            pool.join()
 
     elapsed_time = time.time() - start_time
     print(f"F0 extraction completed in {elapsed_time:.2f} seconds.")
 
 
 if __name__ == "__main__":
-    main(exp_dir, f0_method, hop_length, num_processes)
+    main(exp_dir, f0_method, hop_length, num_processes, gpus)
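Note: the `paths[idx::num_gpus]` slices in `main()` above shard the file list round-robin across the requested GPUs, so the shards are disjoint and together cover every file. A minimal sketch of that slicing, with hypothetical file names:

    paths = ["a.wav", "b.wav", "c.wav", "d.wav", "e.wav"]
    gpus = "0-1".split("-")  # the "-"-separated GPU list from sys.argv[5]
    num_gpus = len(gpus)

    # Each device gets every num_gpus-th file, starting at its own offset.
    shards = {f"cuda:{gpu}": paths[idx::num_gpus] for idx, gpu in enumerate(gpus)}
    print(shards)
    # {'cuda:0': ['a.wav', 'c.wav', 'e.wav'], 'cuda:1': ['b.wav', 'd.wav']}
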
diff --git a/rvc/train/extract/extract_feature_print.py b/rvc/train/extract/extract_feature_print.py
index 8f9262f..13fd751 100644
--- a/rvc/train/extract/extract_feature_print.py
+++ b/rvc/train/extract/extract_feature_print.py
@@ -9,98 +9,160 @@
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
-from rvc.lib.utils import load_embedding
 
-# Parse command line arguments
-device = str(sys.argv[1])
-n_parts = int(sys.argv[2])
-i_part = int(sys.argv[3])
-i_gpu = int(sys.argv[4])
-exp_dir = str(sys.argv[5])
-version = str(sys.argv[6])
-is_half = bool(sys.argv[7])
-embedder_model = str(sys.argv[8])
+from rvc.lib.utils import load_embedding
+from rvc.configs.config import Config
 
-try:
-    embedder_model_custom = str(sys.argv[9])
-except:
-    embedder_model_custom = None
+config = Config()
 
-os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
-wav_path = f"{exp_dir}/sliced_audios_16k"
-out_path = f"{exp_dir}/v1_extracted" if version == "v1" else f"{exp_dir}/v2_extracted"
-os.makedirs(out_path, exist_ok=True)
 
+def setup_paths(exp_dir: str, version: str):
+    """Set up input and output paths."""
+    wav_path = os.path.join(exp_dir, "sliced_audios_16k")
+    out_path = os.path.join(
+        exp_dir, "v1_extracted" if version == "v1" else "v2_extracted"
+    )
+    os.makedirs(out_path, exist_ok=True)
+    return wav_path, out_path
 
-def read_wave(wav_path, normalize=False):
+
+def read_wave(wav_path: str, normalize: bool = False):
+    """Read a wave file and return its features."""
     wav, sr = sf.read(wav_path)
-    assert sr == 16000
+    assert sr == 16000, "Sample rate must be 16000"
+
     feats = torch.from_numpy(wav)
-    feats = feats.half() if is_half else feats.float()
+    feats = feats.half() if config.is_half else feats.float()
     feats = feats.mean(-1) if feats.dim() == 2 else feats
     feats = feats.view(1, -1)
+
     if normalize:
         with torch.no_grad():
             feats = F.layer_norm(feats, feats.shape)
     return feats
 
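Note: `read_wave()` above now takes the half-precision switch from the shared `Config` object instead of a CLI flag. A minimal sketch of the tensor shaping it performs, using random samples in place of `sf.read()` output (the `is_half` guard here is an assumption for illustration; the real value comes from `config.is_half`):

    import torch

    wav = torch.randn(16000)  # stand-in for one second of 16 kHz mono audio
    is_half = torch.cuda.is_available()

    feats = wav.half() if is_half else wav.float()
    feats = feats.mean(-1) if feats.dim() == 2 else feats  # downmix stereo
    feats = feats.view(1, -1)  # (1, samples), the layout the embedder expects
    print(feats.dtype, feats.shape)  # e.g. torch.float32 torch.Size([1, 16000]) on CPU
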
"sliced_audios_16k") + out_path = os.path.join( + exp_dir, "v1_extracted" if version == "v1" else "v2_extracted" + ) + os.makedirs(out_path, exist_ok=True) + return wav_path, out_path -def read_wave(wav_path, normalize=False): +def read_wave(wav_path: str, normalize: bool = False): + """Read a wave file and return its features.""" wav, sr = sf.read(wav_path) - assert sr == 16000 + assert sr == 16000, "Sample rate must be 16000" + feats = torch.from_numpy(wav) - feats = feats.half() if is_half else feats.float() + feats = feats.half() if config.is_half else feats.float() feats = feats.mean(-1) if feats.dim() == 2 else feats feats = feats.view(1, -1) + if normalize: with torch.no_grad(): feats = F.layer_norm(feats, feats.shape) return feats -print("Starting feature extraction...") - -start_time = time.time() - - -models, saved_cfg, task = load_embedding(embedder_model, embedder_model_custom) -model = models[0] -model = model.to(device) -if device not in ["mps", "cpu"]: - model = model.half() -model.eval() - -todo = sorted(os.listdir(wav_path))[i_part::n_parts] -n = max(1, len(todo) // 10) - -if len(todo) == 0: - print( - "An error occurred in the feature extraction, make sure you have provided the audios correctly." - ) -else: - # print(f"{len(todo)}") - with tqdm.tqdm(total=len(todo)) as pbar: - for idx, file in enumerate(todo): - try: - if file.endswith(".wav"): - wav_file_path = os.path.join(wav_path, file) - out_file_path = os.path.join(out_path, file.replace("wav", "npy")) - - if os.path.exists(out_file_path): - continue - - feats = read_wave(wav_file_path, normalize=saved_cfg.task.normalize) - padding_mask = torch.BoolTensor(feats.shape).fill_(False) - inputs = { - "source": feats.to(device), - "padding_mask": padding_mask.to(device), - "output_layer": 9 if version == "v1" else 12, - } - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = ( - model.final_proj(logits[0]) - if version == "v1" - else logits[0] - ) - - feats = feats.squeeze(0).float().cpu().numpy() - if np.isnan(feats).sum() == 0: - np.save(out_file_path, feats, allow_pickle=False) - else: - print(f"{file} is invalid") - pbar.set_description(f"Processing {file} {feats.shape}") - except Exception as error: - print(f"An error occurred processing {file}: {error}") - pbar.update(1) - +def process_file( + file: str, + wav_path: str, + out_path: str, + model: torch.nn.Module, + device: str, + version: str, + saved_cfg: Config, +): + """Process a single audio file.""" + wav_file_path = os.path.join(wav_path, file) + out_file_path = os.path.join(out_path, file.replace("wav", "npy")) + + if os.path.exists(out_file_path): + return + + # Load and prepare features + feats = read_wave(wav_file_path, normalize=saved_cfg.task.normalize) + + # Adjust dtype based on the device + dtype = torch.float16 if device.startswith("cuda") else torch.float32 + feats = feats.to(dtype).to(device) + + padding_mask = torch.BoolTensor(feats.shape).fill_(False).to(dtype).to(device) + + inputs = { + "source": feats, + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + + with torch.no_grad(): + model = model.to(device).to(dtype) + + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + + feats = feats.squeeze(0).float().cpu().numpy() + if not np.isnan(feats).any(): + np.save(out_file_path, feats, allow_pickle=False) + else: + print(f"{file} contains NaN values and will be skipped.") + + +def main(): + """Main function to orchestrate the 
diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py
index 10b8a4a..4803b60 100644
--- a/rvc/train/preprocess/preprocess.py
+++ b/rvc/train/preprocess/preprocess.py
@@ -1,13 +1,11 @@
-from multiprocessing import cpu_count
 import os
 import sys
 import time
-from typing import List, Tuple
-import multiprocessing
-from scipy import signal
-from scipy.io import wavfile
 import librosa
 import numpy as np
+from scipy import signal
+from scipy.io import wavfile
+from multiprocessing import cpu_count, Pool
 
 now_directory = os.getcwd()
 sys.path.append(now_directory)
@@ -28,15 +26,11 @@
 MAX_AMPLITUDE = 0.9
 ALPHA = 0.75
 HIGH_PASS_CUTOFF = 48
+SAMPLE_RATE_16K = 16000
 
 # Define directory paths
-GT_WAVS_DIR = f"{experiment_directory}/sliced_audios"
-WAVS16K_DIR = f"{experiment_directory}/sliced_audios_16k"
-
-# Create directories if they don't exist
-os.makedirs(experiment_directory, exist_ok=True)
-os.makedirs(GT_WAVS_DIR, exist_ok=True)
-os.makedirs(WAVS16K_DIR, exist_ok=True)
+GT_WAVS_DIR = os.path.join(experiment_directory, "sliced_audios")
+WAVS16K_DIR = os.path.join(experiment_directory, "sliced_audios_16k")
 
 
 class PreProcess:
@@ -56,27 +50,37 @@ def __init__(self, sr: int, exp_dir: str, per: float):
         self.per = per
         self.exp_dir = exp_dir
 
-    def normalize_and_write(self, tmp_audio: np.ndarray, idx0: int, idx1: int):
-        tmp_max = np.abs(tmp_audio).max()
+    def _normalize_audio(self, audio: np.ndarray):
+        """Normalizes the audio to the desired amplitude."""
+        tmp_max = np.abs(audio).max()
         if tmp_max > 2.5:
-            print(f"{idx0}-{idx1}-{tmp_max}-filtered")
+            return None  # Indicate audio should be filtered out
+        return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio
+
+    def _write_audio(self, audio: np.ndarray, filename: str, sr: int):
+        """Writes the audio to a WAV file."""
+        wavfile.write(filename, sr, audio.astype(np.float32))
+
+    def process_audio_segment(self, audio_segment: np.ndarray, idx0: int, idx1: int):
+        """Processes a single audio segment."""
+        normalized_audio = self._normalize_audio(audio_segment)
+        if normalized_audio is None:
+            print(f"{idx0}-{idx1}-filtered")
             return
-        tmp_audio = (tmp_audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (
-            1 - ALPHA
-        ) * tmp_audio
-        wavfile.write(
-            f"{GT_WAVS_DIR}/{idx0}_{idx1}.wav",
-            self.sr,
-            tmp_audio.astype(np.float32),
-        )
-        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
-        wavfile.write(
-            f"{WAVS16K_DIR}/{idx0}_{idx1}.wav",
-            16000,
-            tmp_audio.astype(np.float32),
+
+        # Write original sample rate audio
+        gt_wav_path = os.path.join(GT_WAVS_DIR, f"{idx0}_{idx1}.wav")
+        self._write_audio(normalized_audio, gt_wav_path, self.sr)
+
+        # Resample and write 16kHz audio
+        audio_16k = librosa.resample(
+            normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K
         )
+        wav_16k_path = os.path.join(WAVS16K_DIR, f"{idx0}_{idx1}.wav")
+        self._write_audio(audio_16k, wav_16k_path, SAMPLE_RATE_16K)
 
     def process_audio(self, path: str, idx0: int):
+        """Processes a single audio file."""
         try:
             audio = load_audio(path, self.sr)
             audio = signal.lfilter(self.b_high, self.a_high, audio)
@@ -91,35 +95,37 @@ def process_audio(self, path: str, idx0: int):
                         tmp_audio = audio_segment[
                             start : start + int(self.per * self.sr)
                         ]
-                        self.process_audio_segment(tmp_audio, idx0, idx1)
+                        self.process_audio_segment(tmp_audio, idx0, idx1)
                         idx1 += 1
                     else:
                         tmp_audio = audio_segment[start:]
+                        self.process_audio_segment(tmp_audio, idx0, idx1)
                         idx1 += 1
                         break
-                self.normalize_and_write(tmp_audio, idx0, idx1)
         except Exception as error:
             print(f"An error occurred on {path} path: {error}")
 
-    def process_audio_multiprocessing(self, infos: List[Tuple[str, int]]):
-        for path, idx0 in infos:
-            self.process_audio(path, idx0)
+    def process_audio_file(self, file_path_idx):
+        file_path, idx0 = file_path_idx
+        self.process_audio(file_path, idx0)
 
     def process_audio_multiprocessing_input_directory(
         self, input_root: str, num_processes: int
     ):
-        try:
-            infos = [
-                (f"{input_root}/{name}", idx)
-                for idx, name in enumerate(sorted(list(os.listdir(input_root))))
-            ]
-            with multiprocessing.Pool(processes=num_processes) as pool:
-                pool.map(
-                    self.process_audio_multiprocessing,
-                    [infos[i::num_processes] for i in range(num_processes)],
-                )
-        except Exception as error:
-            print(f"An error occurred on {input_root} path: {error}")
+        # Get list of files
+        files = [
+            (os.path.join(input_root, f), idx)
+            for idx, f in enumerate(os.listdir(input_root))
+            if f.endswith(".wav")
+        ]
+
+        # Create the directories if they don't exist
+        os.makedirs(GT_WAVS_DIR, exist_ok=True)
+        os.makedirs(WAVS16K_DIR, exist_ok=True)
+
+        # Use multiprocessing to process files
+        with Pool(processes=num_processes) as pool:
+            pool.map(self.process_audio_file, files)
 
 
 def preprocess_training_set(
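Note: `_normalize_audio()` above blends the peak-normalized signal (weight `ALPHA`, scaled to `MAX_AMPLITUDE`) with the raw signal (weight `1 - ALPHA`), and drops any segment whose peak exceeds 2.5 instead of rescaling it. A worked sketch with made-up samples:

    import numpy as np

    MAX_AMPLITUDE = 0.9
    ALPHA = 0.75

    audio = np.array([0.1, -0.5, 0.25], dtype=np.float32)
    tmp_max = np.abs(audio).max()  # 0.5, below the 2.5 filter threshold

    normalized = (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio
    print(normalized)  # approximately [ 0.16 -0.8   0.4 ]
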
diff --git a/rvc/train/train.py b/rvc/train/train.py
index 07f3f39..52d8618 100644
--- a/rvc/train/train.py
+++ b/rvc/train/train.py
@@ -123,7 +123,8 @@ def main():
     """
     Main function to start the training process.
     """
-
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
     def start():
         """
         Starts the training process with multi-GPU support.
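Note: the patch moves the `MASTER_ADDR`/`MASTER_PORT` assignment from `run()` into `main()` so the rendezvous address is set once, before workers are spawned, and every rank then reads the same values through `init_method="env://"`. A minimal single-process sketch of that rendezvous (the `world_size=1, rank=0` values are illustrative):

    import os
    from random import randint

    import torch.distributed as dist

    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(randint(20000, 55555))

    dist.init_process_group(backend="gloo", init_method="env://", world_size=1, rank=0)
    print(dist.get_rank(), dist.get_world_size())  # 0 1
    dist.destroy_process_group()
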
@@ -267,8 +268,6 @@ def run(
     writer = SummaryWriter(log_dir=experiment_dir)
     writer_eval = SummaryWriter(log_dir=os.path.join(experiment_dir, "eval"))
 
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
     dist.init_process_group(
         backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
     )
diff --git a/rvc_cli.py b/rvc_cli.py
index b9067f4..5c184ff 100644
--- a/rvc_cli.py
+++ b/rvc_cli.py
@@ -199,12 +199,17 @@ def run_tts_script(
         os.remove(output_tts_path)
 
     command_tts = [
-        python,
-        tts_script_path,
-        tts_text,
-        tts_voice,
-        tts_rate,
-        output_tts_path,
+        *map(
+            str,
+            [
+                python,
+                tts_script_path,
+                tts_text,
+                tts_voice,
+                tts_rate,
+                output_tts_path,
+            ],
+        ),
     ]
     subprocess.run(command_tts)
     infer_pipeline = import_voice_converter()
@@ -270,6 +275,7 @@ def run_extract_script(
     pitch_guidance: bool,
     hop_length: int,
     cpu_cores: int,
+    gpu: str,
     sample_rate: int,
     embedder_model: str,
     embedder_model_custom: str = None,
@@ -293,6 +299,7 @@ def run_extract_script(
                 f0_method,
                 hop_length,
                 cpu_cores,
+                gpu,
             ],
         ),
     ]
@@ -303,13 +310,9 @@ def run_extract_script(
         *map(
             str,
             [
-                config.device,
-                1,
-                0,
-                0,
                 model_path,
                 rvc_version,
-                config.is_half,
+                gpu,
                 embedder_model,
                 embedder_model_custom,
             ],
@@ -1035,6 +1038,12 @@ def parse_arguments():
         choices=range(1, 65),
         default=None,
     )
+    extract_parser.add_argument(
+        "--gpu",
+        type=str,
+        help="GPU device to use for feature extraction (optional).",
+        default="-",
+    )
     extract_parser.add_argument(
         "--sample_rate",
         type=int,
@@ -1441,6 +1450,7 @@ def main():
             pitch_guidance=args.pitch_guidance,
             hop_length=args.hop_length,
             cpu_cores=args.cpu_cores,
+            gpu=args.gpu,
             sample_rate=args.sample_rate,
             embedder_model=args.embedder_model,
             embedder_model_custom=args.embedder_model_custom,
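Note: argparse applies `type` only to values parsed from the command line, so the `"-"` default (meaning CPU) passes through untouched, and `run_extract_script()` str()-maps whatever arrives into the extractor command lines. A small sketch of how the new flag resolves:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", type=str, default="-")

    print(parser.parse_args([]).gpu)              # '-'  -> CPU path in the extractors
    print(parser.parse_args(["--gpu", "0"]).gpu)  # '0'  -> CUDA_VISIBLE_DEVICES="0"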