From a9862185a5d5c65d2372cc8354cb457395097a6e Mon Sep 17 00:00:00 2001 From: Jaideep Rao Date: Wed, 2 Apr 2025 12:57:13 -0400 Subject: [PATCH] update docling api references Signed-off-by: Jaideep Rao (cherry picked from commit d4f5f0a1abe8ced5c8b0efee47cbfa65f73a4b2a) --- .github/workflows/test.yml | 12 +++++++++ requirements.txt | 5 +++- src/instructlab/rag/convert.py | 45 ++++++++++++++++++++++++++-------- tests/test_lab_rag_convert.py | 4 +-- tox.ini | 1 + 5 files changed, 54 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7c95fe81da..cf7cf70671 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -123,6 +123,18 @@ jobs: rm -rf "$GITHUB_WORKSPACE"/src/github.com/containers/skopeo skopeo --version + # deactivate MPS acceleration on Github CI for MacOS + # see https://github.com/actions/runner-images/issues/9918 + - name: Disable MPS acceleration on MacOS + if: startsWith(matrix.platform, 'macos') + run: | + echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION="true"' >> "$GITHUB_ENV" + + - name: Enable MPS acceleration on non-MacOS runners + if: ! startsWith(matrix.platform, 'macos') + run: | + echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION=' >> "$GITHUB_ENV" + - name: Install tools on MacOS if: startsWith(matrix.platform, 'macos') run: | diff --git a/requirements.txt b/requirements.txt index 1341e9cc0a..82bf79d3f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,9 @@ toml>=0.10.2 # Default version. Can be overridden in extra requirements torch>=2.3.0,<2.7.0 tqdm>=4.66.2 -transformers>=4.41.2 +# temporary cap until https://github.com/instructlab/training/pull/443 is merged and consumed within instructlab +# above PR fixes interactions with newer versions of transformers through the training library +transformers>=4.41.2,<4.51.0 trl>=0.12.2,<0.15.0 wandb>=0.16.4 xdg-base-dirs>=6.0.1 @@ -40,4 +42,5 @@ psutil>=6.0.0 huggingface_hub[hf_transfer]>=0.1.8 haystack-ai>=2.8 docling-core[chunking]>=2.10.0 +docling>=2.18.0 sentence-transformers>=3.0.0 diff --git a/src/instructlab/rag/convert.py b/src/instructlab/rag/convert.py index 42385c496c..826ba3144f 100644 --- a/src/instructlab/rag/convert.py +++ b/src/instructlab/rag/convert.py @@ -6,7 +6,7 @@ # Standard from pathlib import Path -from typing import Iterable +from typing import Iterable, Optional import json import logging import os @@ -21,6 +21,7 @@ from docling.datamodel.pipeline_options import OcrOptions # type: ignore from docling.datamodel.pipeline_options import PdfPipelineOptions # type: ignore from docling.datamodel.pipeline_options import TesseractOcrOptions # type: ignore +from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions from xdg_base_dirs import xdg_data_dirs, xdg_data_home import yaml @@ -175,7 +176,12 @@ def _initialize_docling(): artifacts_path=docling_model_path, do_ocr=False, ) - ocr_options = _resolve_ocr_options() + + if os.getenv("INSTRUCTLAB_DISABLE_GPU_ACCELERATION") == "true": + pipeline_options.accelerator_options = AcceleratorOptions( + device=AcceleratorDevice.CPU + ) + ocr_options = resolve_ocr_options(docling_model_path=docling_model_path) if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options @@ -192,13 +198,18 @@ def _initialize_docling(): # Adapted from sdg.utils.chunkers because that code is being refactored so we want to avoid importing anything from it. # TODO: Once the code base has settled down, we should make sure this code exists only in one place. -def _resolve_ocr_options() -> OcrOptions: +def resolve_ocr_options( + docling_model_path: Optional[Path] = None, +) -> Optional[OcrOptions]: """ Attempts to resolve OCR options for a PDF document. It first tries to use the Tesseract OCR library, and if that fails, it tries the EasyOCR library. If neither of these libraries are available, it returns None to indicate that OCR will not be used. Note that it imports the OCR libraries inside the code if/when they are needed because they are kind of heavy. """ + # Declare ocr_options explicitly as Optional[OcrOptions] + ocr_options: Optional[OcrOptions] = None + # First, attempt to use tesserocr try: ocr_options = TesseractOcrOptions() @@ -209,11 +220,16 @@ def _resolve_ocr_options() -> OcrOptions: TesseractOcrModel, ) - _ = TesseractOcrModel(True, ocr_options) + _ = TesseractOcrModel( + enabled=True, + artifacts_path=docling_model_path, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU), + ) return ocr_options except ImportError: # No tesserocr, so try something else - pass + logger.warning("Tesseract not found, falling back to EasyOCR.") try: # pylint: disable=import-outside-toplevel # Third Party @@ -221,13 +237,22 @@ def _resolve_ocr_options() -> OcrOptions: EasyOcrModel, ) - ocr_options = EasyOcrOptions() - - # Keep easyocr models on the CPU instead of GPU - ocr_options.use_gpu = False + ocr_options = EasyOcrOptions( + lang=["en"], + use_gpu=None, + confidence_threshold=0.5, + model_storage_directory=str(docling_model_path), + recog_network="standard", + download_enabled=True, + ) # triggers torch loading, import lazily - _ = EasyOcrModel(True, ocr_options) + _ = EasyOcrModel( + enabled=True, + artifacts_path=None, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU), + ) return ocr_options except ImportError: # no easyocr either, so don't use any OCR diff --git a/tests/test_lab_rag_convert.py b/tests/test_lab_rag_convert.py index 416138ef17..fe11d2cf35 100644 --- a/tests/test_lab_rag_convert.py +++ b/tests/test_lab_rag_convert.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +from io import BytesIO from pathlib import Path from typing import Dict, Iterable, Iterator, List, Optional, Union from unittest.mock import patch @@ -10,7 +11,6 @@ from docling.backend.docling_parse_v2_backend import ( # type: ignore # noqa: F401 DoclingParseV2DocumentBackend, ) -from docling.datamodel.base_models import DocumentStream # type: ignore # noqa: F401 from docling.datamodel.base_models import ( # type: ignore # noqa: F401 ConversionStatus, InputFormat, @@ -36,7 +36,7 @@ def __init__( def convert_all( self, - source: Iterable[Union[Path, str, DocumentStream]], # pylint: disable=unused-argument; noqa: ARG002 + source: Iterable[Union[Path, BytesIO]], # pylint: disable=unused-argument; noqa: ARG002 raises_on_error: bool = True, # pylint: disable=unused-argument; noqa: ARG002 ) -> Iterator[ConversionResult]: # Third Party diff --git a/tox.ini b/tox.ini index 2731d9d091..a8bfaa08a4 100644 --- a/tox.ini +++ b/tox.ini @@ -10,6 +10,7 @@ minversion = 4.4 description = run tests (unit, unitcov, functional) passenv = CMAKE_ARGS + INSTRUCTLAB_DISABLE_GPU_ACCELERATION # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies # are huge. This reduces venv from 5.7 GB to 1.5 GB. setenv =