Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions vechord/augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import httpx
import msgspec

from vechord.utils import GEMINI_GENERATE_RPS, RateLimitTransport


class BaseAugmenter(ABC):
@abstractmethod
Expand Down Expand Up @@ -49,6 +51,7 @@ def __init__(self, model: str = "gemini-2.5-flash"):
self.client = httpx.AsyncClient(
headers={"Content-Type": "application/json"},
timeout=httpx.Timeout(120.0, connect=5.0),
transport=RateLimitTransport(max_per_second=GEMINI_GENERATE_RPS),
)

async def __aenter__(self):
Expand Down
3 changes: 3 additions & 0 deletions vechord/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import httpx
import msgspec

from vechord.utils import GEMINI_GENERATE_RPS, RateLimitTransport


class BaseChunker(ABC):
@abstractmethod
Expand Down Expand Up @@ -117,6 +119,7 @@ def __init__(self, model: str = "gemini-2.5-flash", size: int = 1536):
self.client = httpx.AsyncClient(
headers={"Content-Type": "application/json"},
timeout=httpx.Timeout(120.0, connect=5.0),
transport=RateLimitTransport(max_per_second=GEMINI_GENERATE_RPS),
)
self.prompt = f"""
You are an expert text chunker, skilled at dividing documents into meaningful
Expand Down
2 changes: 2 additions & 0 deletions vechord/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from vechord.log import logger
from vechord.model import SparseEmbedding
from vechord.utils import GEMINI_EMBEDDING_RPS, RateLimitTransport


class VecType(Enum):
Expand Down Expand Up @@ -91,6 +92,7 @@ def __init__(self, model: str = "gemini-embedding-exp-03-07", dim: int = 3072):
params={"key": self.api_key},
headers={"Content-Type": "application/json"},
timeout=httpx.Timeout(30.0, connect=10.0),
transport=RateLimitTransport(max_per_second=GEMINI_EMBEDDING_RPS),
)

async def __aenter__(self):
Expand Down
2 changes: 2 additions & 0 deletions vechord/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import msgspec

from vechord.model import Entity, Relation
from vechord.utils import GEMINI_GENERATE_RPS, RateLimitTransport


class BaseEntityRecognizer(ABC):
Expand Down Expand Up @@ -142,6 +143,7 @@ def __init__(self, model: str = "gemini-2.5-flash"):
self.client = httpx.AsyncClient(
headers={"Content-Type": "application/json"},
timeout=httpx.Timeout(30.0, connect=5.0),
transport=RateLimitTransport(max_per_second=GEMINI_GENERATE_RPS),
)

async def __aenter__(self):
Expand Down
2 changes: 2 additions & 0 deletions vechord/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytrec_eval

from vechord.model import RetrievedChunk
from vechord.utils import GEMINI_GENERATE_RPS, RateLimitTransport


class BaseEvaluator(ABC):
Expand Down Expand Up @@ -79,6 +80,7 @@ def __init__(self, model: str = "gemini-2.5-flash"):
self.client = httpx.AsyncClient(
headers={"Content-Type": "application/json"},
timeout=httpx.Timeout(120.0, connect=5.0),
transport=RateLimitTransport(max_per_second=GEMINI_GENERATE_RPS),
)
self.prompt = """
Given the following chunk of text and the overall document it belongs to, generate
Expand Down
2 changes: 2 additions & 0 deletions vechord/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from vechord.log import logger
from vechord.model import Document
from vechord.utils import GEMINI_GENERATE_RPS, RateLimitTransport


class BaseHTMLParser(HTMLParser):
Expand Down Expand Up @@ -109,6 +110,7 @@ def __init__(self, model: str = "gemini-2.5-flash"):
self.client = httpx.AsyncClient(
timeout=httpx.Timeout(10.0, read=120.0),
headers={"Content-Type": "application/json"},
transport=RateLimitTransport(max_per_second=GEMINI_GENERATE_RPS),
)

def name(self) -> str:
Expand Down
55 changes: 55 additions & 0 deletions vechord/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import asyncio
import sys
from typing import Any

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self


import httpx

# Requests-per-second ceilings used to pace calls to the Gemini API.
# https://ai.google.dev/gemini-api/docs/rate-limits#tier-1
# 16.66 RPS ≈ 1000 requests/minute (generate-content endpoints).
GEMINI_GENERATE_RPS = 16.66
# NOTE(review): 0.6 RPS is presumably derived from the embedding-model
# quota on the same tier — confirm against the linked rate-limit docs.
GEMINI_EMBEDDING_RPS = 0.6


class RateLimitTransport(httpx.AsyncHTTPTransport):
    """httpx async transport that throttles outgoing requests to a fixed rate.

    Request starts are spaced at least ``1 / max_per_second`` seconds apart
    using the event-loop clock, so all concurrent tasks sharing one transport
    instance are collectively rate limited.
    """

    def __init__(self, max_per_second: float = 5, **kwargs) -> None:
        """
        Async HTTP transport with rate limit.

        Args:
            max_per_second: Maximum number of requests per second.
                Must be positive.

        Other args are passed to httpx.AsyncHTTPTransport.

        Raises:
            ValueError: if ``max_per_second`` is not positive — a zero value
                would raise ``ZeroDivisionError`` below, and a negative one
                would silently disable throttling.
        """
        if max_per_second <= 0:
            raise ValueError("max_per_second must be positive")
        # Minimum spacing between two consecutive request starts, in seconds.
        self.interval: float = 1 / max_per_second
        # Earliest event-loop time at which the next request may start.
        self.next_start_time: float = 0.0
        super().__init__(**kwargs)

    async def notify_task_start(self) -> None:
        """Block until this task may start, then claim the next time slot.

        Adapted from aiometer's rate meter:
        https://github.com/florimondmanca/aiometer/blob/358976e0b60bce29b9fe8c59807fafbad3e62cbc/src/aiometer/_impl/meters.py#L57

        The shared ``next_start_time`` is re-read after every sleep so that
        concurrent waiters do not all wake and start at once.  The final
        read-modify-write of ``next_start_time`` contains no ``await``, so it
        is atomic with respect to other asyncio tasks on the same loop.
        """
        loop = asyncio.get_running_loop()
        while True:
            now = loop.time()
            next_start_time = max(self.next_start_time, now)
            until_now = next_start_time - now
            if until_now <= self.interval:
                break
            await asyncio.sleep(max(0, until_now - self.interval))
        self.next_start_time = max(self.next_start_time, now) + self.interval

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        """Wait for a rate-limit slot, then forward the request."""
        await self.notify_task_start()
        return await super().handle_async_request(request)

    async def __aenter__(self) -> Self:
        # NOTE(review): entering the context consumes one rate-limit slot
        # before any request has been sent — confirm this is intentional
        # (it delays and budgets client startup, e.g. ~1.7s at 0.6 RPS).
        await self.notify_task_start()
        return await super().__aenter__()

    async def __aexit__(self, *args: Any) -> None:
        await super().__aexit__(*args)