instructlab · jaideepr97 · Apr 8, 2025 · Apr 2, 2025 · booxter · Apr 7, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -122,6 +122,18 @@ jobs:
           rm -rf "$GITHUB_WORKSPACE"/src/github.com/containers/skopeo
           skopeo --version
 
+        # Deactivate MPS acceleration on macOS CI (https://github.com/actions/runner-images/issues/9918).
+        # GITHUB_ENV values are taken literally, so `true` must be written without quotes.
+      - name: Disable MPS acceleration on MacOS
+        if: startsWith(matrix.platform, 'macos')
+        run: |
+          echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION=true' >> "$GITHUB_ENV"
+
+      - name: Clear GPU acceleration override on non-MacOS runners
+        if: ${{ !startsWith(matrix.platform, 'macos') }}  # a bare leading '!' is YAML tag syntax, so wrap in ${{ }}
+        run: |
+          echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION=' >> "$GITHUB_ENV"
+
       - name: Install tools on MacOS
         if: startsWith(matrix.platform, 'macos')
         run: |

diff --git a/requirements.txt b/requirements.txt
@@ -34,12 +34,15 @@ toml>=0.10.2
 # Default version. Can be overridden in extra requirements
 torch>=2.3.0,<2.6.0
 tqdm>=4.66.2
-transformers>=4.41.2
+# Temporary cap until https://github.com/instructlab/training/pull/443 is merged and consumed by instructlab.
+# That PR fixes the training library's interactions with newer versions of transformers.
+transformers>=4.41.2,<4.51.0
 trl>=0.12.2,<0.15.0
 wandb>=0.16.4
 xdg-base-dirs>=6.0.1
 psutil>=6.0.0
 huggingface_hub[hf_transfer]>=0.1.8
 haystack-ai>=2.8
 docling-core[chunking]>=2.10.0
+docling>=2.18.0
 sentence-transformers>=3.0.0
diff --git a/src/instructlab/rag/convert.py b/src/instructlab/rag/convert.py
@@ -6,7 +6,7 @@
 
 # Standard
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional
 import json
 import logging
 import os
@@ -21,6 +21,7 @@
 from docling.datamodel.pipeline_options import OcrOptions  # type: ignore
 from docling.datamodel.pipeline_options import PdfPipelineOptions  # type: ignore
 from docling.datamodel.pipeline_options import TesseractOcrOptions  # type: ignore
+from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
 import yaml
 
@@ -175,7 +176,12 @@ def _initialize_docling():
         artifacts_path=docling_model_path,
         do_ocr=False,
     )
-    ocr_options = _resolve_ocr_options()
+
+    if os.getenv("INSTRUCTLAB_DISABLE_GPU_ACCELERATION") == "true":
+        pipeline_options.accelerator_options = AcceleratorOptions(
+            device=AcceleratorDevice.CPU
+        )
+    ocr_options = resolve_ocr_options(docling_model_path=docling_model_path)
     if ocr_options is not None:
         pipeline_options.do_ocr = True
         pipeline_options.ocr_options = ocr_options
@@ -192,13 +198,18 @@ def _initialize_docling():
 
 # Adapted from sdg.utils.chunkers because that code is being refactored so we want to avoid importing anything from it.
 # TODO: Once the code base has settled down, we should make sure this code exists only in one place.
-def _resolve_ocr_options() -> OcrOptions:
+def resolve_ocr_options(
+    docling_model_path: Optional[Path] = None,
+) -> Optional[OcrOptions]:
     """
     Attempts to resolve OCR options for a PDF document. It first tries to use the Tesseract OCR library,
     and if that fails, it tries the EasyOCR library. If neither of these libraries are available, it
     returns None to indicate that OCR will not be used. Note that it imports the OCR libraries inside
     the code if/when they are needed because they are kind of heavy.
     """
+    # Declare ocr_options explicitly as Optional[OcrOptions]
+    ocr_options: Optional[OcrOptions] = None
+
     # First, attempt to use tesserocr
     try:
         ocr_options = TesseractOcrOptions()
@@ -209,25 +220,39 @@ def _resolve_ocr_options() -> OcrOptions:
             TesseractOcrModel,
         )
 
-        _ = TesseractOcrModel(True, ocr_options)
+        _ = TesseractOcrModel(
+            enabled=True,
+            artifacts_path=docling_model_path,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # No tesserocr, so try something else
-        pass
+        logger.warning("Tesseract not found, falling back to EasyOCR.")
     try:
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.easyocr_model import (  # type: ignore[import-untyped]
             EasyOcrModel,
         )
 
-        ocr_options = EasyOcrOptions()
-
-        # Keep easyocr models on the CPU instead of GPU
-        ocr_options.use_gpu = False
+        ocr_options = EasyOcrOptions(
+            lang=["en"],
+            use_gpu=None,
+            confidence_threshold=0.5,
+            model_storage_directory=str(docling_model_path),
+            recog_network="standard",
+            download_enabled=True,
+        )
         # triggers torch loading, import lazily
 
-        _ = EasyOcrModel(True, ocr_options)
+        _ = EasyOcrModel(
+            enabled=True,
+            artifacts_path=None,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR

diff --git a/tests/test_lab_rag_convert.py b/tests/test_lab_rag_convert.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, Iterable, Iterator, List, Optional, Union
 from unittest.mock import patch
@@ -10,7 +11,6 @@
 from docling.backend.docling_parse_v2_backend import (  # type: ignore  # noqa: F401
     DoclingParseV2DocumentBackend,
 )
-from docling.datamodel.base_models import DocumentStream  # type: ignore  # noqa: F401
 from docling.datamodel.base_models import (  # type: ignore  # noqa: F401
     ConversionStatus,
     InputFormat,
@@ -36,7 +36,7 @@ def __init__(
 
     def convert_all(
         self,
-        source: Iterable[Union[Path, str, DocumentStream]],  # pylint: disable=unused-argument; noqa: ARG002
+        source: Iterable[Union[Path, BytesIO]],  # pylint: disable=unused-argument; noqa: ARG002
         raises_on_error: bool = True,  # pylint: disable=unused-argument; noqa: ARG002
     ) -> Iterator[ConversionResult]:
         # Third Party

diff --git a/tox.ini b/tox.ini
@@ -10,6 +10,7 @@ minversion = 4.4
 description = run tests (unit, unitcov, functional)
 passenv =
     CMAKE_ARGS
+    INSTRUCTLAB_DISABLE_GPU_ACCELERATION
 # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
 # are huge. This reduces venv from 5.7 GB to 1.5 GB.
 setenv =