-import onnxruntime
+import asyncio
+import platform
+import sys
+import threading
+from typing import (
+    AsyncGenerator,
+    Generator,
+    Iterator,
+    Literal,
+    NotRequired,
+    TypedDict,
+    cast,
+)
+
 import numpy as np
+import onnxruntime
 from huggingface_hub import hf_hub_download
-import sys
-import platform
-from typing import Generator, Iterator, cast, TypedDict, AsyncGenerator, NotRequired, Literal
 from numpy.typing import NDArray
-import asyncio
-import threading
 
 
 class TTSOptions(TypedDict):
@@ -22,42 +31,56 @@ class TTSOptions(TypedDict):
2231 """Minimum probability for top-p sampling. Default: 0.05"""
2332 pre_buffer_size : NotRequired [float ]
2433 """Seconds of audio to generate before yielding the first chunk. Smoother audio streaming at the cost of higher time to wait for the first chunk."""
-    voice_id: NotRequired[Literal["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]]
+    voice_id: NotRequired[
+        Literal["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
+    ]
     """The voice to use for the TTS. Default: "tara"."""
 
+
 CUSTOM_TOKEN_PREFIX = "<custom_token_"
 
+
 class OrpheusCpp:
     def __init__(self, verbose: bool = True):
         import importlib.util
+
         if importlib.util.find_spec("llama_cpp") is None:
             if sys.platform == "darwin":
                 # Check if macOS 11.0+ on arm64 (Apple Silicon)
                 is_arm64 = platform.machine() == "arm64"
                 version = platform.mac_ver()[0].split(".")
                 is_macos_11_plus = len(version) >= 2 and int(version[0]) >= 11
                 is_macos_10_less = len(version) >= 2 and int(version[0]) < 11
-
+
                 if is_arm64 and is_macos_11_plus:
                     extra_index_url = "--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal"
                 elif is_macos_10_less:
-                    raise ImportError("llama_cpp does not have pre-built wheels for macOS 10.x "
-                                      "Follow install instructions at https://github.com/abetlen/llama-cpp-python")
+                    raise ImportError(
+                        "llama_cpp does not have pre-built wheels for macOS 10.x. "
+                        "Follow the install instructions at https://github.com/abetlen/llama-cpp-python"
+                    )
                 else:
                     extra_index_url = "--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
             else:
                 extra_index_url = "--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
 
-            raise ImportError(f"llama_cpp is not installed. Please install it using `pip install llama-cpp-python {extra_index_url}`.")
+            raise ImportError(
+                f"llama_cpp is not installed. Please install it using `pip install llama-cpp-python {extra_index_url}`."
+            )
 
-        model_file = hf_hub_download(repo_id="isaiahbjork/orpheus-3b-0.1-ft-Q4_K_M-GGUF",
-                                     filename="orpheus-3b-0.1-ft-q4_k_m.gguf")
+        model_file = hf_hub_download(
+            repo_id="isaiahbjork/orpheus-3b-0.1-ft-Q4_K_M-GGUF",
+            filename="orpheus-3b-0.1-ft-q4_k_m.gguf",
+        )
         from llama_cpp import Llama
+
         self._llm = Llama(model_path=model_file, n_ctx=0, verbose=verbose)
 
         repo_id = "onnx-community/snac_24khz-ONNX"
         snac_model_file = "decoder_model.onnx"
-        snac_model_path = hf_hub_download(repo_id, subfolder="onnx", filename=snac_model_file)
+        snac_model_path = hf_hub_download(
+            repo_id, subfolder="onnx", filename=snac_model_file
+        )
 
         # Load SNAC model with optimizations
         self._snac_session = onnxruntime.InferenceSession(
@@ -67,16 +90,16 @@ def __init__(self, verbose: bool = True):
 
     def _token_to_id(self, token_text: str, index: int) -> int | None:
         token_string = token_text.strip()
-
+
         # Find the last token in the string
         last_token_start = token_string.rfind(CUSTOM_TOKEN_PREFIX)
-
+
         if last_token_start == -1:
             return None
-
+
         # Extract the last token
         last_token = token_string[last_token_start:]
-
+
         # Process the last token
         if last_token.startswith(CUSTOM_TOKEN_PREFIX) and last_token.endswith(">"):
             try:
@@ -87,8 +110,10 @@ def _token_to_id(self, token_text: str, index: int) -> int | None:
                 return None
         else:
             return None
-
-    def _decode(self, token_gen: Generator[str, None, None]) -> Generator[np.ndarray, None, None]:
+
+    def _decode(
+        self, token_gen: Generator[str, None, None]
+    ) -> Generator[bytes, None, None]:
        """Token decoder that converts a token stream into a stream of audio chunks (int16 PCM bytes)."""
         buffer = []
         count = 0
@@ -97,79 +122,90 @@ def _decode(self, token_gen: Generator[str, None, None]) -> Generator[np.ndarray, None, None]:
             if token is not None and token > 0:
                 buffer.append(token)
                 count += 1
-
+
                 # Convert to audio when we have enough tokens
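+                # (7 tokens form one SNAC frame; count > 27 waits for a full
+                # 28-token window, which is re-decoded over the last 28 tokens
+                # each time 7 new tokens arrive)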
                 if count % 7 == 0 and count > 27:
                     buffer_to_proc = buffer[-28:]
                     audio_samples = self._convert_to_audio(buffer_to_proc)
                     if audio_samples is not None:
                         yield audio_samples
-
+
     def _convert_to_audio(self, multiframe: list[int]) -> bytes | None:
         if len(multiframe) < 28:  # Ensure we have enough tokens
             return None
-
+
         num_frames = len(multiframe) // 7
-        frame = multiframe[:num_frames * 7]
-
+        frame = multiframe[: num_frames * 7]
+
         # Initialize empty numpy arrays instead of torch tensors
         codes_0 = np.array([], dtype=np.int32)
         codes_1 = np.array([], dtype=np.int32)
         codes_2 = np.array([], dtype=np.int32)
-
+
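+        # SNAC uses three hierarchical codebooks; each 7-token frame interleaves
+        # 1 coarse code (level 0), 2 medium codes (level 1), and 4 fine codes
+        # (level 2), unpacked by the loop below.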
         for j in range(num_frames):
-            i = 7*j
+            i = 7 * j
             # Append values to numpy arrays
             codes_0 = np.append(codes_0, frame[i])
-
-            codes_1 = np.append(codes_1, [frame[i+1], frame[i+4]])
-
-            codes_2 = np.append(codes_2, [frame[i+2], frame[i+3], frame[i+5], frame[i+6]])
-
+
+            codes_1 = np.append(codes_1, [frame[i + 1], frame[i + 4]])
+
+            codes_2 = np.append(
+                codes_2, [frame[i + 2], frame[i + 3], frame[i + 5], frame[i + 6]]
+            )
+
         # Reshape arrays to match the expected input format (add batch dimension)
         codes_0 = np.expand_dims(codes_0, axis=0)
         codes_1 = np.expand_dims(codes_1, axis=0)
         codes_2 = np.expand_dims(codes_2, axis=0)
-
+
         # Check that all tokens are between 0 and 4096
-        if (np.any(codes_0 < 0) or np.any(codes_0 > 4096) or
-            np.any(codes_1 < 0) or np.any(codes_1 > 4096) or
-            np.any(codes_2 < 0) or np.any(codes_2 > 4096)):
+        if (
+            np.any(codes_0 < 0)
+            or np.any(codes_0 > 4096)
+            or np.any(codes_1 < 0)
+            or np.any(codes_1 > 4096)
+            or np.any(codes_2 < 0)
+            or np.any(codes_2 > 4096)
+        ):
             return None
-
+
         # Create input dictionary for ONNX session
 
         snac_input_names = [x.name for x in self._snac_session.get_inputs()]
 
         input_dict = dict(zip(snac_input_names, [codes_0, codes_1, codes_2]))
-
+
         # Run inference
         audio_hat = self._snac_session.run(None, input_dict)[0]
-
+
         # Process output
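+        # Keep only the middle slice of the decoded window: successive 28-token
+        # windows overlap, so trimming the edges appears to avoid boundary
+        # artifacts between yielded chunks.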
         audio_np = audio_hat[:, :, 2048:4096]
         audio_int16 = (audio_np * 32767).astype(np.int16)
         audio_bytes = audio_int16.tobytes()
         return audio_bytes
 
-    def tts(self, text: str, options: TTSOptions | None = None) -> tuple[int, NDArray[np.int16]]:
+    def tts(
+        self, text: str, options: TTSOptions | None = None
+    ) -> tuple[int, NDArray[np.int16]]:
         buffer = []
         for _, array in self.stream_tts_sync(text, options):
             buffer.append(array)
         return (24_000, np.concatenate(buffer, axis=1))
-
+
     async def stream_tts(
         self, text: str, options: TTSOptions | None = None
     ) -> AsyncGenerator[tuple[int, NDArray[np.int16]], None]:
-
         queue = asyncio.Queue()
         finished = asyncio.Event()
+
         def stream_to_queue(text, options, queue, finished):
             for chunk in self.stream_tts_sync(text, options):
                 queue.put_nowait(chunk)
             finished.set()
-
-        thread = threading.Thread(target=stream_to_queue, args=(text, options, queue, finished))
+
+        thread = threading.Thread(
+            target=stream_to_queue, args=(text, options, queue, finished)
+        )
         thread.start()
         while not finished.is_set():
             try:
@@ -180,18 +216,25 @@ def stream_to_queue(text, options, queue, finished):
             chunk = queue.get_nowait()
             yield chunk
 
-    def _token_gen(self, text: str, options: TTSOptions | None = None) -> Generator[str, None, None]:
+    def _token_gen(
+        self, text: str, options: TTSOptions | None = None
+    ) -> Generator[str, None, None]:
         from llama_cpp import CreateCompletionStreamResponse
+
         options = options or TTSOptions()
         voice_id = options.get("voice_id", "tara")
         text = f"<|audio|>{voice_id}: {text}<|eot_id|><custom_token_4>"
-        token_gen = self._llm(text, max_tokens=options.get("max_tokens", 2_048), stream=True,
-                              temperature=options.get("temperature", 0.8),
-                              top_p=options.get("top_p", 0.95),
-                              top_k=options.get("top_k", 40),
-                              min_p=options.get("min_p", 0.05))
+        token_gen = self._llm(
+            text,
+            max_tokens=options.get("max_tokens", 2_048),
+            stream=True,
+            temperature=options.get("temperature", 0.8),
+            top_p=options.get("top_p", 0.95),
+            top_k=options.get("top_k", 40),
+            min_p=options.get("min_p", 0.05),
+        )
         for token in cast(Iterator[CreateCompletionStreamResponse], token_gen):
-            yield token['choices'][0]['text']
+            yield token["choices"][0]["text"]
 
     def stream_tts_sync(
         self, text: str, options: TTSOptions | None = None
@@ -212,4 +255,3 @@ def stream_tts_sync(
             yield (24_000, audio_array)
         if not started_playback:
             yield (24_000, pre_buffer)
-
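
For reference, a minimal usage sketch of the API touched by this diff (the `orpheus_cpp` import path and the option values are assumptions for illustration, not part of this commit):

    from orpheus_cpp import OrpheusCpp, TTSOptions  # assumed import path

    tts = OrpheusCpp(verbose=False)
    options: TTSOptions = {"voice_id": "tara", "pre_buffer_size": 1.5}

    # One-shot synthesis: returns (sample_rate, int16 array of shape (1, num_samples)).
    sample_rate, audio = tts.tts("Hello from Orpheus.", options)

    # Incremental synthesis: yields (sample_rate, chunk) pairs as audio is decoded.
    for sample_rate, chunk in tts.stream_tts_sync("Streaming example.", options):
        ...  # feed each chunk to an audio sink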