the-open-agent
diff --git a/.goreleaser.yaml
Lines changed: 1 addition & 1 deletion b/.goreleaser.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/deploy/ocr-service/Dockerfile
Lines changed: 22 additions & 0 deletions b/deploy/ocr-service/Dockerfile
Lines changed: 22 additions & 0 deletions
diff --git a/deploy/ocr-service/README.md
Lines changed: 50 additions & 0 deletions b/deploy/ocr-service/README.md
Lines changed: 50 additions & 0 deletions
diff --git a/deploy/ocr-service/app.py
Lines changed: 73 additions & 0 deletions b/deploy/ocr-service/app.py
Lines changed: 73 additions & 0 deletions
diff --git a/deploy/ocr-service/requirements.txt
Lines changed: 7 additions & 0 deletions b/deploy/ocr-service/requirements.txt
Lines changed: 7 additions & 0 deletions
diff --git a/docker-compose.ocr.yml
Lines changed: 8 additions & 0 deletions b/docker-compose.ocr.yml
Lines changed: 8 additions & 0 deletions
diff --git a/embed.go
Lines changed: 8 additions & 4 deletions b/embed.go
Lines changed: 8 additions & 4 deletions
diff --git a/embedsupport/setup.go
Lines changed: 12 additions & 7 deletions b/embedsupport/setup.go
Lines changed: 12 additions & 7 deletions
@@ -24,7 +24,7 @@ builds:
       - arm64
 
 # Release raw executables — no zip/tar.gz wrapper.
-# conf/, web/build/, and skills/ are baked into the binary via go:embed.
+# conf/, web/build/, skills/, and deploy/ocr-service/ are baked into the binary via go:embed.
 archives:
   - format: binary
     name_template: >-
 
@@ -0,0 +1,22 @@
+# Container image for the OpenAgent OCR service (app.py served by uvicorn).
+FROM python:3.10-slim-bookworm
+
+# Skip .pyc generation and keep stdout/stderr unbuffered so logs show up immediately.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# Native shared libraries; presumably needed by the OpenCV / onnxruntime wheels.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends libglib2.0-0 libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+# requirements.txt already installs opencv-python-headless, and both OpenCV
+# distributions ship the same cv2 package directory. Uninstalling only
+# opencv-python deletes those shared files while pip still considers the
+# headless distribution installed, so a plain "pip install" afterwards would be
+# a no-op and leave cv2 broken. Remove both (pip skips a distribution that is
+# not installed) and reinstall the headless build.
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt \
+    && pip uninstall -y opencv-python opencv-python-headless \
+    && pip install --no-cache-dir opencv-python-headless
+
+COPY app.py .
+
+EXPOSE 8001
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8001"]
@@ -0,0 +1,50 @@
+# OpenAgent OCR Service
+
+This service provides the OCR endpoint used by the `local_file` tool.
+
+## API
+
+- `GET /health`
+- `POST /ocr/pdf`
+  - `multipart/form-data`
+  - file field: `file`
+  - response: `{"text":"recognized text"}`
+
+## Run with OpenAgent
+
+When an active store enables the `local_file` tool and the tool Provider URL is empty, OpenAgent warms up the managed OCR service in the background during startup. If the first `local_pdf_ocr_read` call arrives before warmup finishes, that call waits for the same managed OCR startup instead of starting a second service.
+
+The managed Python environment is created under:
+
+```text
+tmp/ocr-service/.venv
+```
+
+If the `local_file` tool Provider URL is empty, OpenAgent uses the managed endpoint by default:
+
+```text
+http://127.0.0.1:8001/ocr/pdf
+```
+
+Python 3.10+ must already be installed on the machine. OpenAgent installs the Python package dependencies into the managed virtual environment during the first managed OCR startup or background warmup.
+
+## Run with Docker
+
+From the OpenAgent repository root:
+
+```bash
+docker compose -f docker-compose.ocr.yml up --build
+```
+
+To force OpenAgent to use the Docker service instead of the managed service, set the `local_file` tool Provider URL to:
+
+```text
+http://127.0.0.1:8001/ocr/pdf
+```
+
+## Test
+
+```bash
+curl http://127.0.0.1:8001/health
+curl -F "file=@/absolute/path/to/scanned.pdf" http://127.0.0.1:8001/ocr/pdf
+```
@@ -0,0 +1,73 @@
+import os
+import tempfile
+from threading import Lock
+
+import pypdfium2 as pdfium
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from rapidocr import RapidOCR
+
+
+app = FastAPI(title="OpenAgent OCR Service")
+ocr_engine = RapidOCR()
+ocr_lock = Lock()
+
+
+@app.get("/health")
+def health():
+    """Health endpoint: return a constant ``{"status": "ok"}`` payload."""
+    return {"status": "ok"}
+
+
+@app.post("/ocr/pdf")
+async def ocr_pdf(file: UploadFile = File(...)):
+    """Run OCR on an uploaded PDF and return the recognized text.
+
+    Returns ``{"text": <recognized text>}`` on success.
+
+    Raises:
+        HTTPException(400): the upload does not start with the ``%PDF`` bytes.
+        HTTPException(500): OCR failed; ``detail`` includes the underlying
+            error message.
+    """
+    # The entire upload is read into memory before it is validated.
+    content = await file.read()
+    if not content.startswith(b"%PDF"):
+        raise HTTPException(status_code=400, detail="file must be a PDF")
+
+    try:
+        # NOTE(review): this is a synchronous call inside an async handler, so
+        # other requests (including /health) presumably wait until OCR
+        # finishes -- confirm that is acceptable.
+        text = read_pdf_text(content)
+    except Exception as err:
+        raise HTTPException(status_code=500, detail=f"failed to OCR PDF: {err}") from err
+
+    return {"text": text}
+
+
+def read_pdf_text(content: bytes) -> str:
+    """OCR every page of a PDF given as raw bytes and return the combined text.
+
+    The bytes are written to a temporary file, opened with pypdfium2, and each
+    page is passed to ``read_page_text`` with a 1-based page number. Pages that
+    yield no text are skipped; the remaining page texts are joined with a
+    blank line between them.
+    """
+    temp_path = write_temp_pdf(content)
+    try:
+        pdf = pdfium.PdfDocument(temp_path)
+        try:
+            page_texts = [read_page_text(pdf[index], index + 1) for index in range(len(pdf))]
+        finally:
+            # Close the document before the temporary file is removed below.
+            pdf.close()
+    finally:
+        # The temporary file is removed whether or not OCR succeeded.
+        os.remove(temp_path)
+
+    return "\n\n".join(text for text in page_texts if text)
+
+
+def write_temp_pdf(content: bytes) -> str:
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
+        temp_file.write(content)
+        return temp_file.name
+
+
+def read_page_text(page, page_number: int) -> str:
+    """Render one PDF page to an image, OCR it, and return the labelled text.
+
+    Args:
+        page: page object obtained by indexing the opened PDF document; it is
+            closed by this function.
+        page_number: number used in the ``Page N`` header of the result.
+
+    Returns:
+        ``"Page <page_number>\\n<text>"`` where ``<text>`` is the non-empty
+        recognized items joined by newlines, or ``""`` when nothing was
+        recognized.
+    """
+    try:
+        bitmap = page.render(scale=2.0)
+        try:
+            image = bitmap.to_pil().convert("RGB")
+        finally:
+            bitmap.close()
+    finally:
+        # The page is closed even if rendering or conversion fails.
+        page.close()
+
+    # Only the call into the shared OCR engine is made while holding the lock.
+    with ocr_lock:
+        result = ocr_engine(image)
+
+    if result is None or result.txts is None:
+        return ""
+
+    text = "\n".join(item for item in result.txts if item)
+    if text == "":
+        return ""
+    return f"Page {page_number}\n{text}"
@@ -0,0 +1,7 @@
+fastapi
+uvicorn[standard]
+python-multipart
+rapidocr
+onnxruntime
+pypdfium2
+opencv-python-headless
@@ -0,0 +1,8 @@
+services:
+  openagent-ocr:
+    restart: unless-stopped
+    build:
+      context: ./deploy/ocr-service
+      dockerfile: Dockerfile
+    ports:
+      - "8001:8001"
@@ -15,9 +15,9 @@
 //go:build embed
 
 // This file is only compiled when building with -tags embed.
-// It embeds conf/, web/build/ (without source-map files), and skills/ into
-// the binary, and wires them up via embedsupport.Setup so that the server
-// can run from a single executable without any on-disk assets.
+// It embeds conf/, web/build/ (without source-map files), skills/, and the
+// OCR service into the binary, and wires them up via embedsupport.Setup so
+// that the server can run from a single executable without any on-disk assets.
 // On-disk files always take priority over the embedded versions at runtime.
 
 package main
@@ -44,9 +44,13 @@ var _embeddedWeb embed.FS
 //go:embed skills
 var _embeddedSkills embed.FS
 
+//go:embed deploy/ocr-service
+var _embeddedOcrService embed.FS
+
 func init() {
 	confFS, _ := fs.Sub(_embeddedConf, "conf")
 	webFS, _ := fs.Sub(_embeddedWeb, "web/build")
 	skillsFS, _ := fs.Sub(_embeddedSkills, "skills")
-	embedsupport.Setup(confFS, webFS, skillsFS)
+	ocrServiceFS, _ := fs.Sub(_embeddedOcrService, "deploy/ocr-service")
+	embedsupport.Setup(confFS, webFS, skillsFS, ocrServiceFS)
 }
@@ -13,24 +13,26 @@
 // limitations under the License.
 
 // Package embedsupport wires up the optional embedded filesystems for conf,
-// web/build, and skills. When the binary is built with -tags embed, the
-// caller (main) passes the embedded fs.FS values here via Setup. At runtime,
-// on-disk files always take priority; the embedded versions are used only when
-// the corresponding directory is absent next to the executable.
+// web/build, skills, and the OCR service. When the binary is built with
+// -tags embed, the caller (main) passes the embedded fs.FS values here via
+// Setup. At runtime, on-disk files always take priority; the embedded versions
+// are used only when the corresponding directory is absent next to the executable.
 package embedsupport
 
 import "io/fs"
 
 var (
-	webFS    fs.FS
-	skillsFS fs.FS
+	webFS        fs.FS
+	skillsFS     fs.FS
+	ocrServiceFS fs.FS
 )
 
 // Setup must be called at the very start of main(), before any config values
 // are read or HTTP requests are served.
-func Setup(conf, web, skills fs.FS) {
+func Setup(conf, web, skills, ocrService fs.FS) {
 	webFS = web
 	skillsFS = skills
+	ocrServiceFS = ocrService
 	setupConf(conf)
 }
 
@@ -39,3 +41,6 @@ func WebFS() fs.FS { return webFS }
 
 // SkillsFS returns the embedded skills filesystem, or nil if not available.
 func SkillsFS() fs.FS { return skillsFS }
+
+// OcrServiceFS returns the embedded OCR service filesystem, or nil if not available.
+func OcrServiceFS() fs.FS { return ocrServiceFS }