Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 6911e49

Browse files
authored
feat: support PDF OCR recognition (#2229)
1 parent b17e859 commit 6911e49

15 files changed

Lines changed: 901 additions & 16 deletions

File tree

.goreleaser.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ builds:
2424
- arm64
2525

2626
# Release raw executables — no zip/tar.gz wrapper.
27-
# conf/, web/build/, and skills/ are baked into the binary via go:embed.
27+
# conf/, web/build/, skills/, and deploy/ocr-service/ are baked into the binary via go:embed.
2828
archives:
2929
- format: binary
3030
name_template: >-

deploy/ocr-service/Dockerfile

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
FROM python:3.10-slim-bookworm
2+
3+
ENV PYTHONDONTWRITEBYTECODE=1
4+
ENV PYTHONUNBUFFERED=1
5+
6+
WORKDIR /app
7+
8+
RUN apt-get update \
9+
&& apt-get install -y --no-install-recommends libglib2.0-0 libgomp1 \
10+
&& rm -rf /var/lib/apt/lists/*
11+
12+
COPY requirements.txt .
13+
RUN pip install --no-cache-dir --upgrade pip \
14+
&& pip install --no-cache-dir -r requirements.txt \
15+
&& pip uninstall -y opencv-python \
16+
&& pip install --no-cache-dir opencv-python-headless
17+
18+
COPY app.py .
19+
20+
EXPOSE 8001
21+
22+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8001"]

deploy/ocr-service/README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# OpenAgent OCR Service
2+
3+
This service provides the OCR endpoint used by the `local_file` tool.
4+
5+
## API
6+
7+
- `GET /health`
8+
- `POST /ocr/pdf`
9+
- `multipart/form-data`
10+
- file field: `file`
11+
- response: `{"text":"recognized text"}`
12+
13+
## Run with OpenAgent
14+
15+
When an active store enables the `local_file` tool and the tool Provider URL is empty, OpenAgent warms up the managed OCR service in the background during startup. If the first `local_pdf_ocr_read` call arrives before warmup finishes, that call waits for the same managed OCR startup instead of starting a second service.
16+
17+
The managed Python environment is created under:
18+
19+
```text
20+
tmp/ocr-service/.venv
21+
```
22+
23+
If the `local_file` tool Provider URL is empty, OpenAgent uses the managed endpoint by default:
24+
25+
```text
26+
http://127.0.0.1:8001/ocr/pdf
27+
```
28+
29+
Python 3.10+ must already be installed on the machine. OpenAgent installs the Python package dependencies into the managed virtual environment during the first managed OCR startup or background warmup.
30+
31+
## Run with Docker
32+
33+
From the OpenAgent repository root:
34+
35+
```bash
36+
docker compose -f docker-compose.ocr.yml up --build
37+
```
38+
39+
To force OpenAgent to use the Docker service instead of the managed service, set the `local_file` tool Provider URL to:
40+
41+
```text
42+
http://127.0.0.1:8001/ocr/pdf
43+
```
44+
45+
## Test
46+
47+
```bash
48+
curl http://127.0.0.1:8001/health
49+
curl -F "file=@/absolute/path/to/scanned.pdf" http://127.0.0.1:8001/ocr/pdf
50+
```

deploy/ocr-service/app.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import os
2+
import tempfile
3+
from threading import Lock
4+
5+
import pypdfium2 as pdfium
6+
from fastapi import FastAPI, File, HTTPException, UploadFile
7+
from rapidocr import RapidOCR
8+
9+
10+
app = FastAPI(title="OpenAgent OCR Service")
11+
ocr_engine = RapidOCR()
12+
ocr_lock = Lock()
13+
14+
15+
@app.get("/health")
def health():
    """Liveness probe: always answers with a fixed ``{"status": "ok"}`` payload."""
    payload = {"status": "ok"}
    return payload
18+
19+
20+
# Serializes whole-document processing once it runs on worker threads.
# PDFium is documented as not thread-safe, and before this handler offloaded
# its work every request was implicitly serialized on the event-loop thread.
_pdf_job_lock = Lock()


def _read_pdf_text_serialized(content: bytes) -> str:
    """Run ``read_pdf_text`` while holding the document-level lock."""
    with _pdf_job_lock:
        return read_pdf_text(content)


@app.post("/ocr/pdf")
async def ocr_pdf(file: UploadFile = File(...)):
    """Run OCR over an uploaded PDF and return the recognized text.

    Args:
        file: The multipart upload (form field ``file``).

    Returns:
        A dict of the form ``{"text": <recognized text>}``.

    Raises:
        HTTPException: 400 when the payload does not start with the ``%PDF``
            signature; 500 when rendering or OCR fails.
    """
    # Function-scope import so this handler brings its own dependency.
    import asyncio

    content = await file.read()
    if not content.startswith(b"%PDF"):
        raise HTTPException(status_code=400, detail="file must be a PDF")

    try:
        # Rendering and OCR are blocking, CPU-bound work. Calling them
        # directly inside an ``async def`` handler stalls the event loop, so
        # /health and every other request would hang until OCR finished.
        # Hand the job to a worker thread and await its result instead.
        text = await asyncio.to_thread(_read_pdf_text_serialized, content)
    except Exception as err:
        raise HTTPException(status_code=500, detail=f"failed to OCR PDF: {err}") from err

    return {"text": text}
32+
33+
34+
def read_pdf_text(content: bytes) -> str:
    """OCR every page of the PDF held in ``content`` and join the results.

    The bytes are written to a temporary file, which is removed afterwards
    whether or not processing succeeds. Pages that yield no text are skipped;
    the remaining page texts are separated by a blank line.
    """
    pdf_path = write_temp_pdf(content)
    try:
        document = pdfium.PdfDocument(pdf_path)
        try:
            recognized = []
            for page_index in range(len(document)):
                # Page numbers reported to the reader are 1-based.
                recognized.append(read_page_text(document[page_index], page_index + 1))
        finally:
            document.close()
    finally:
        os.remove(pdf_path)

    non_empty = [page_text for page_text in recognized if page_text]
    return "\n\n".join(non_empty)
46+
47+
48+
def write_temp_pdf(content: bytes) -> str:
    """Write ``content`` to a new ``.pdf`` temporary file and return its path.

    The file is created with ``delete=False``, so on success the caller owns
    it and is responsible for removing it. If the write fails, the partially
    written file is removed before the error propagates, because the caller
    never receives a path it could clean up.
    """
    temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with temp_file:
            temp_file.write(content)
    except BaseException:
        # Best-effort cleanup; never let it mask the original failure.
        try:
            os.remove(temp_file.name)
        except OSError:
            pass
        raise
    return temp_file.name
52+
53+
54+
def read_page_text(page, page_number: int) -> str:
    """Render one PDF page, OCR it, and return its text under a ``Page N`` heading.

    The intermediate bitmap and the page itself are closed before OCR runs,
    even if rendering fails. Returns an empty string when the engine
    recognizes nothing on the page.
    """
    try:
        rendered = page.render(scale=2.0)
        try:
            image = rendered.to_pil().convert("RGB")
        finally:
            rendered.close()
    finally:
        page.close()

    # Only one OCR call at a time goes through the shared engine.
    with ocr_lock:
        outcome = ocr_engine(image)

    if outcome is None or outcome.txts is None:
        return ""

    page_text = "\n".join(line for line in outcome.txts if line)
    if not page_text:
        return ""
    return f"Page {page_number}\n{page_text}"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
fastapi
2+
uvicorn[standard]
3+
python-multipart
4+
rapidocr
5+
onnxruntime
6+
pypdfium2
7+
opencv-python-headless

docker-compose.ocr.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
services:
2+
openagent-ocr:
3+
restart: unless-stopped
4+
build:
5+
context: ./deploy/ocr-service
6+
dockerfile: Dockerfile
7+
ports:
8+
- "8001:8001"

embed.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
//go:build embed
1616

1717
// This file is only compiled when building with -tags embed.
18-
// It embeds conf/, web/build/ (without source-map files), and skills/ into
19-
// the binary, and wires them up via embedsupport.Setup so that the server
20-
// can run from a single executable without any on-disk assets.
18+
// It embeds conf/, web/build/ (without source-map files), skills/, and the
19+
// OCR service into the binary, and wires them up via embedsupport.Setup so
20+
// that the server can run from a single executable without any on-disk assets.
2121
// On-disk files always take priority over the embedded versions at runtime.
2222

2323
package main
@@ -44,9 +44,13 @@ var _embeddedWeb embed.FS
4444
//go:embed skills
4545
var _embeddedSkills embed.FS
4646

47+
//go:embed deploy/ocr-service
48+
var _embeddedOcrService embed.FS
49+
4750
func init() {
4851
confFS, _ := fs.Sub(_embeddedConf, "conf")
4952
webFS, _ := fs.Sub(_embeddedWeb, "web/build")
5053
skillsFS, _ := fs.Sub(_embeddedSkills, "skills")
51-
embedsupport.Setup(confFS, webFS, skillsFS)
54+
ocrServiceFS, _ := fs.Sub(_embeddedOcrService, "deploy/ocr-service")
55+
embedsupport.Setup(confFS, webFS, skillsFS, ocrServiceFS)
5256
}

embedsupport/setup.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,26 @@
1313
// limitations under the License.
1414

1515
// Package embedsupport wires up the optional embedded filesystems for conf,
16-
// web/build, and skills. When the binary is built with -tags embed, the
17-
// caller (main) passes the embedded fs.FS values here via Setup. At runtime,
18-
// on-disk files always take priority; the embedded versions are used only when
19-
// the corresponding directory is absent next to the executable.
16+
// web/build, skills, and the OCR service. When the binary is built with
17+
// -tags embed, the caller (main) passes the embedded fs.FS values here via
18+
// Setup. At runtime, on-disk files always take priority; the embedded versions
19+
// are used only when the corresponding directory is absent next to the executable.
2020
package embedsupport
2121

2222
import "io/fs"
2323

2424
var (
25-
webFS fs.FS
26-
skillsFS fs.FS
25+
webFS fs.FS
26+
skillsFS fs.FS
27+
ocrServiceFS fs.FS
2728
)
2829

2930
// Setup must be called at the very start of main(), before any config values
3031
// are read or HTTP requests are served.
31-
func Setup(conf, web, skills fs.FS) {
32+
func Setup(conf, web, skills, ocrService fs.FS) {
3233
webFS = web
3334
skillsFS = skills
35+
ocrServiceFS = ocrService
3436
setupConf(conf)
3537
}
3638

@@ -39,3 +41,6 @@ func WebFS() fs.FS { return webFS }
3941

4042
// SkillsFS returns the embedded skills filesystem, or nil if not available.
4143
func SkillsFS() fs.FS { return skillsFS }
44+
45+
// OcrServiceFS returns the embedded OCR service filesystem, or nil if not available.
46+
func OcrServiceFS() fs.FS { return ocrServiceFS }

0 commit comments

Comments
 (0)