Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,18 @@ jobs:
rm -rf "$GITHUB_WORKSPACE"/src/github.com/containers/skopeo
skopeo --version

# Deactivate MPS acceleration on GitHub CI for macOS runners.
# see https://github.com/actions/runner-images/issues/9918
- name: Disable MPS acceleration on MacOS
  if: startsWith(matrix.platform, 'macos')
  # Lines appended to GITHUB_ENV are read literally as NAME=value, so the
  # value must not be wrapped in quotes: the consumer compares it against
  # the exact string true, and stored quote characters would defeat that.
  run: |
    echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION=true' >> "$GITHUB_ENV"

- name: Enable MPS acceleration on non-MacOS runners
  # An expression that begins with `!` must be wrapped in ${{ }}: a bare
  # leading `!` is YAML tag notation, not a logical negation.
  if: ${{ !startsWith(matrix.platform, 'macos') }}
  # Writes an empty value, leaving GPU acceleration enabled on these runners.
  run: |
    echo 'INSTRUCTLAB_DISABLE_GPU_ACCELERATION=' >> "$GITHUB_ENV"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: it's probably not necessary to set it to an empty string here, since os.getenv() returns None when the variable is not set, which still fails the == "true" check.


- name: Install tools on MacOS
if: startsWith(matrix.platform, 'macos')
run: |
Expand Down
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@ toml>=0.10.2
# Default version. Can be overridden in extra requirements
torch>=2.3.0,<2.6.0
tqdm>=4.66.2
transformers>=4.41.2
# temporary cap until https://github.com/instructlab/training/pull/443 is merged and consumed within instructlab
# above PR fixes interactions with newer versions of transformers through the training library
transformers>=4.41.2,<4.51.0
trl>=0.12.2,<0.15.0
wandb>=0.16.4
xdg-base-dirs>=6.0.1
psutil>=6.0.0
huggingface_hub[hf_transfer]>=0.1.8
haystack-ai>=2.8
docling-core[chunking]>=2.10.0
docling>=2.18.0
sentence-transformers>=3.0.0
45 changes: 35 additions & 10 deletions src/instructlab/rag/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

# Standard
from pathlib import Path
from typing import Iterable
from typing import Iterable, Optional
import json
import logging
import os
Expand All @@ -21,6 +21,7 @@
from docling.datamodel.pipeline_options import OcrOptions # type: ignore
from docling.datamodel.pipeline_options import PdfPipelineOptions # type: ignore
from docling.datamodel.pipeline_options import TesseractOcrOptions # type: ignore
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from xdg_base_dirs import xdg_data_dirs, xdg_data_home
import yaml

Expand Down Expand Up @@ -175,7 +176,12 @@ def _initialize_docling():
artifacts_path=docling_model_path,
do_ocr=False,
)
ocr_options = _resolve_ocr_options()

if os.getenv("INSTRUCTLAB_DISABLE_GPU_ACCELERATION") == "true":
pipeline_options.accelerator_options = AcceleratorOptions(
device=AcceleratorDevice.CPU
)
ocr_options = resolve_ocr_options(docling_model_path=docling_model_path)
if ocr_options is not None:
pipeline_options.do_ocr = True
pipeline_options.ocr_options = ocr_options
Expand All @@ -192,13 +198,18 @@ def _initialize_docling():

# Adapted from sdg.utils.chunkers because that code is being refactored so we want to avoid importing anything from it.
# TODO: Once the code base has settled down, we should make sure this code exists only in one place.
def _resolve_ocr_options() -> OcrOptions:
def resolve_ocr_options(
docling_model_path: Optional[Path] = None,
) -> Optional[OcrOptions]:
"""
Attempts to resolve OCR options for a PDF document. It first tries to use the Tesseract OCR library,
and if that fails, it tries the EasyOCR library. If neither of these libraries are available, it
returns None to indicate that OCR will not be used. Note that it imports the OCR libraries inside
the code if/when they are needed because they are kind of heavy.
"""
# Declare ocr_options explicitly as Optional[OcrOptions]
ocr_options: Optional[OcrOptions] = None

# First, attempt to use tesserocr
try:
ocr_options = TesseractOcrOptions()
Expand All @@ -209,25 +220,39 @@ def _resolve_ocr_options() -> OcrOptions:
TesseractOcrModel,
)

_ = TesseractOcrModel(True, ocr_options)
_ = TesseractOcrModel(
enabled=True,
artifacts_path=docling_model_path,
options=ocr_options,
accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
)
return ocr_options
except ImportError:
# No tesserocr, so try something else
pass
logger.warning("Tesseract not found, falling back to EasyOCR.")
try:
# pylint: disable=import-outside-toplevel
# Third Party
from docling.models.easyocr_model import ( # type: ignore[import-untyped]
EasyOcrModel,
)

ocr_options = EasyOcrOptions()

# Keep easyocr models on the CPU instead of GPU
ocr_options.use_gpu = False
ocr_options = EasyOcrOptions(
lang=["en"],
use_gpu=None,
confidence_threshold=0.5,
model_storage_directory=str(docling_model_path),
recog_network="standard",
download_enabled=True,
)
# triggers torch loading, import lazily

_ = EasyOcrModel(True, ocr_options)
_ = EasyOcrModel(
enabled=True,
artifacts_path=None,
options=ocr_options,
accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
)
return ocr_options
except ImportError:
# no easyocr either, so don't use any OCR
Expand Down
4 changes: 2 additions & 2 deletions tests/test_lab_rag_convert.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Union
from unittest.mock import patch
Expand All @@ -10,7 +11,6 @@
from docling.backend.docling_parse_v2_backend import ( # type: ignore # noqa: F401
DoclingParseV2DocumentBackend,
)
from docling.datamodel.base_models import DocumentStream # type: ignore # noqa: F401
from docling.datamodel.base_models import ( # type: ignore # noqa: F401
ConversionStatus,
InputFormat,
Expand All @@ -36,7 +36,7 @@ def __init__(

def convert_all(
self,
source: Iterable[Union[Path, str, DocumentStream]], # pylint: disable=unused-argument; noqa: ARG002
source: Iterable[Union[Path, BytesIO]], # pylint: disable=unused-argument; noqa: ARG002
raises_on_error: bool = True, # pylint: disable=unused-argument; noqa: ARG002
) -> Iterator[ConversionResult]:
# Third Party
Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ minversion = 4.4
description = run tests (unit, unitcov, functional)
passenv =
CMAKE_ARGS
INSTRUCTLAB_DISABLE_GPU_ACCELERATION
# Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
# are huge. This reduces venv from 5.7 GB to 1.5 GB.
setenv =
Expand Down