From a9862185a5d5c65d2372cc8354cb457395097a6e Mon Sep 17 00:00:00 2001
From: Jaideep Rao <jrao@redhat.com>
Date: Wed, 2 Apr 2025 12:57:13 -0400
Subject: [PATCH] update docling api references

Signed-off-by: Jaideep Rao <jrao@redhat.com>
(cherry picked from commit d4f5f0a1abe8ced5c8b0efee47cbfa65f73a4b2a)
---
 .github/workflows/test.yml     | 12 +++++++++
 requirements.txt               |  5 +++-
 src/instructlab/rag/convert.py | 45 ++++++++++++++++++++++++++--------
 tests/test_lab_rag_convert.py  |  4 +--
 tox.ini                        |  1 +
 5 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7c95fe81da..cf7cf70671 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -123,6 +123,18 @@ jobs:
           rm -rf "$GITHUB_WORKSPACE"/src/github.com/containers/skopeo
           skopeo --version
 
+        # Deactivate MPS acceleration on GitHub CI for macOS; the value is written unquoted because GITHUB_ENV keeps quotes literally.
+        # see https://github.com/actions/runner-images/issues/9918
+      - name: Disable MPS acceleration on MacOS
+        if: startsWith(matrix.platform, 'macos')
+        run: |
+          echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION=true' >> "$GITHUB_ENV"
+
+      - name: Enable MPS acceleration on non-MacOS runners
+        if: ${{ !startsWith(matrix.platform, 'macos') }}  # a leading '!' is YAML tag syntax, so the negated expression must be wrapped in ${{ }}
+        run: |
+          echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION=' >> "$GITHUB_ENV"
+
       - name: Install tools on MacOS
         if: startsWith(matrix.platform, 'macos')
         run: |
diff --git a/requirements.txt b/requirements.txt
index 1341e9cc0a..82bf79d3f7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -32,7 +32,9 @@ toml>=0.10.2
 # Default version. Can be overridden in extra requirements
 torch>=2.3.0,<2.7.0
 tqdm>=4.66.2
-transformers>=4.41.2
+# temporary cap until https://github.com/instructlab/training/pull/443 is merged and consumed within instructlab
+# above PR fixes interactions with newer versions of transformers through the training library
+transformers>=4.41.2,<4.51.0
 trl>=0.12.2,<0.15.0
 wandb>=0.16.4
 xdg-base-dirs>=6.0.1
@@ -40,4 +42,5 @@ psutil>=6.0.0
 huggingface_hub[hf_transfer]>=0.1.8
 haystack-ai>=2.8
 docling-core[chunking]>=2.10.0
+docling>=2.18.0
 sentence-transformers>=3.0.0
diff --git a/src/instructlab/rag/convert.py b/src/instructlab/rag/convert.py
index 42385c496c..826ba3144f 100644
--- a/src/instructlab/rag/convert.py
+++ b/src/instructlab/rag/convert.py
@@ -6,7 +6,7 @@
 
 # Standard
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional
 import json
 import logging
 import os
@@ -21,6 +21,7 @@
 from docling.datamodel.pipeline_options import OcrOptions  # type: ignore
 from docling.datamodel.pipeline_options import PdfPipelineOptions  # type: ignore
 from docling.datamodel.pipeline_options import TesseractOcrOptions  # type: ignore
+from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
 import yaml
 
@@ -175,7 +176,12 @@ def _initialize_docling():
         artifacts_path=docling_model_path,
         do_ocr=False,
     )
-    ocr_options = _resolve_ocr_options()
+
+    if os.getenv("INSTRUCTLAB_DISABLE_GPU_ACCELERATION") == "true":
+        pipeline_options.accelerator_options = AcceleratorOptions(
+            device=AcceleratorDevice.CPU
+        )
+    ocr_options = resolve_ocr_options(docling_model_path=docling_model_path)
     if ocr_options is not None:
         pipeline_options.do_ocr = True
         pipeline_options.ocr_options = ocr_options
@@ -192,13 +198,18 @@ def _initialize_docling():
 
 # Adapted from sdg.utils.chunkers because that code is being refactored so we want to avoid importing anything from it.
 # TODO: Once the code base has settled down, we should make sure this code exists only in one place.
-def _resolve_ocr_options() -> OcrOptions:
+def resolve_ocr_options(
+    docling_model_path: Optional[Path] = None,
+) -> Optional[OcrOptions]:
     """
     Attempts to resolve OCR options for a PDF document. It first tries to use the Tesseract OCR library,
     and if that fails, it tries the EasyOCR library. If neither of these libraries are available, it
     returns None to indicate that OCR will not be used. Note that it imports the OCR libraries inside
     the code if/when they are needed because they are kind of heavy.
     """
+    # Declare ocr_options explicitly as Optional[OcrOptions]
+    ocr_options: Optional[OcrOptions] = None
+
     # First, attempt to use tesserocr
     try:
         ocr_options = TesseractOcrOptions()
@@ -209,11 +220,16 @@ def _resolve_ocr_options() -> OcrOptions:
             TesseractOcrModel,
         )
 
-        _ = TesseractOcrModel(True, ocr_options)
+        _ = TesseractOcrModel(
+            enabled=True,
+            artifacts_path=docling_model_path,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # No tesserocr, so try something else
-        pass
+        logger.warning("Tesseract not found, falling back to EasyOCR.")
     try:
         # pylint: disable=import-outside-toplevel
         # Third Party
@@ -221,13 +237,22 @@ def _resolve_ocr_options() -> OcrOptions:
             EasyOcrModel,
         )
 
-        ocr_options = EasyOcrOptions()
-
-        # Keep easyocr models on the CPU instead of GPU
-        ocr_options.use_gpu = False
+        ocr_options = EasyOcrOptions(
+            lang=["en"],
+            use_gpu=None,
+            confidence_threshold=0.5,
+            model_storage_directory=str(docling_model_path),
+            recog_network="standard",
+            download_enabled=True,
+        )
         # triggers torch loading, import lazily
 
-        _ = EasyOcrModel(True, ocr_options)
+        _ = EasyOcrModel(
+            enabled=True,
+            artifacts_path=None,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR
diff --git a/tests/test_lab_rag_convert.py b/tests/test_lab_rag_convert.py
index 416138ef17..fe11d2cf35 100644
--- a/tests/test_lab_rag_convert.py
+++ b/tests/test_lab_rag_convert.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, Iterable, Iterator, List, Optional, Union
 from unittest.mock import patch
@@ -10,7 +11,6 @@
 from docling.backend.docling_parse_v2_backend import (  # type: ignore  # noqa: F401
     DoclingParseV2DocumentBackend,
 )
-from docling.datamodel.base_models import DocumentStream  # type: ignore  # noqa: F401
 from docling.datamodel.base_models import (  # type: ignore  # noqa: F401
     ConversionStatus,
     InputFormat,
@@ -36,7 +36,7 @@ def __init__(
 
     def convert_all(
         self,
-        source: Iterable[Union[Path, str, DocumentStream]],  # pylint: disable=unused-argument; noqa: ARG002
+        source: Iterable[Union[Path, BytesIO]],  # pylint: disable=unused-argument; noqa: ARG002
         raises_on_error: bool = True,  # pylint: disable=unused-argument; noqa: ARG002
     ) -> Iterator[ConversionResult]:
         # Third Party
diff --git a/tox.ini b/tox.ini
index 2731d9d091..a8bfaa08a4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,6 +10,7 @@ minversion = 4.4
 description = run tests (unit, unitcov, functional)
 passenv =
     CMAKE_ARGS
+    INSTRUCTLAB_DISABLE_GPU_ACCELERATION
 # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
 # are huge. This reduces venv from 5.7 GB to 1.5 GB.
 setenv =