diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 81a9f20cfb..94e2d08612 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -345,6 +345,7 @@ jobs:
PYTHONPATH: ${{ github.workspace }}
run: |
source .venv/bin/activate
+ sudo apt-get install -y diffstat
./test_unstructured_ingest/check-diff-expected-output-html.sh
test_unstructured_api_unit:
diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml
index 317f46ec0c..33402ae260 100644
--- a/.github/workflows/ingest-test-fixtures-update-pr.yml
+++ b/.github/workflows/ingest-test-fixtures-update-pr.yml
@@ -139,6 +139,7 @@ jobs:
token: ${{ secrets.GH_CREATE_PR_TOKEN }}
add-paths: |
test_unstructured_ingest/expected-structured-output
+ test_unstructured_ingest/expected-structured-output-html
test_unstructured_ingest/metrics
commit-message: "Update ingest test fixtures"
branch: ${{ env.BRANCH_NAME }}
diff --git a/.gitignore b/.gitignore
index e8e4471465..87f4fc72bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -208,4 +208,5 @@ outputhtmldiff.txt
metricsdiff.txt
# analysis
-annotated/
\ No newline at end of file
+annotated/
+.aider*
diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa47187bdc..20a4bcaf71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,65 @@
+## 0.17.7-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+- **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is `True` and the image is a PNG with a transparency layer, Pillow raises an error on save. The transparency layer is now removed before the image is saved (a sketch of the idea follows this file's diff).
+
+## 0.17.6
+
+### Enhancements
+
+### Features
+
+### Fixes
+- The sort_page_element() helper now uses the element id to sort elements.
+Previously, two executions of the same code on the same file could produce the elements in a different, effectively random order,
+which made it impossible to write stable unit tests or to obtain reproducible results.
+- **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`)
+- Resolve open CVEs
+- Properly handle the case when an element's `text` attribute is None
+
+
+## 0.17.5
+
+### Enhancements
+- **Remove test and dev dependencies from docker image.** This reduces the docker image size slightly and reduces potential security vulnerabilities.
+
+### Features
+
+### Fixes
+- **Removed out of date ubuntu Dockerfile.** The Dockerfile was out of date and non-functional.
+- **Fix for 'PSSyntaxError' import error: "cannot import name 'PSSyntaxError' from 'pdfminer.pdfparser'"** PSSyntaxError needed to be imported from its source 'pdfminer.psexceptions'.
+
+## 0.17.4
+
+### Enhancements
+
+### Features
+
+### Fixes
+- **Deprecate `stage_for_label_studio` and drop `label_studio_sdk` dependency.** This resolves a CVE due to the dependency on `label_studio_sdk`.
+
+## 0.17.3
+
+### Enhancements
+
+### Features
+
+### Fixes
+- Resolve open CVEs
+
+## 0.17.3-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+- **Fixes wrong detection of office files.** Certain office files (.docx, .xlsx, and .pptx) containing parts other than word/document.xml, xl/workbook.xml, and ppt/presentation.xml respectively were wrongly identified as .ZIP. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml, and ppt/presentation\*.xml.
+
## 0.17.2
* Fix Image in an <img> tag being "UncategorizedText" with no .text
@@ -93,6 +155,7 @@
### Fixes
- **Fix file type detection for NDJSON files** NDJSON files were being detected as JSON due to having the same mime-type.
+- Base image was updated to resolve CVEs; the pipeline was run manually to build it
## 0.16.20
diff --git a/Dockerfile b/Dockerfile
index 69b96d3e67..e4d7ebd5be 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ ENV TESSDATA_PREFIX=/usr/local/share/tessdata
ENV NLTK_DATA=/home/notebook-user/nltk_data
# Install Python dependencies and download required NLTK packages
-RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \
+RUN find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt" ! -name "constraints.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \
mkdir -p ${NLTK_DATA} && \
$PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
$PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
diff --git a/Makefile b/Makefile
index c5208c365c..fe1350d5f5 100644
--- a/Makefile
+++ b/Makefile
@@ -310,7 +310,8 @@ docker-test:
-v ${CURRENT_DIR}/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
$(DOCKER_IMAGE) \
- bash -c "CI=$(CI) \
+ bash -c "pip install -r requirements/test.txt -r requirements/dev.txt && \
+ CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
python3 -m pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
@@ -339,4 +340,5 @@ run-jupyter:
.PHONY: html-fixtures-update
html-fixtures-update:
+	rm -rf test_unstructured_ingest/expected-structured-output-html && \
test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html
diff --git a/docker/rockylinux-9.2/Dockerfile b/docker/rockylinux-9.2/Dockerfile
index 3bce864e37..051294dc96 100644
--- a/docker/rockylinux-9.2/Dockerfile
+++ b/docker/rockylinux-9.2/Dockerfile
@@ -22,7 +22,7 @@ COPY requirements requirements
RUN python3.10 -m pip install pip==${PIP_VERSION} && \
dnf -y groupinstall "Development Tools" && \
- find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \
+ find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt" ! -name "constraints.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \
dnf -y groupremove "Development Tools" && \
dnf clean all
diff --git a/docker/ubuntu-22/Dockerfile b/docker/ubuntu-22/Dockerfile
deleted file mode 100644
index 059bfc85bb..0000000000
--- a/docker/ubuntu-22/Dockerfile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Dockerfile that approximates the CI image
-#
-# Mainly useful for updating test-ingest fixtures
-
-FROM ubuntu:22.04
-
-COPY scripts/setup_ubuntu.sh scripts/setup_ubuntu.sh
-
-RUN bash scripts/setup_ubuntu.sh root
-
-COPY requirements/ requirements/
-COPY Makefile Makefile
-
-SHELL ["/bin/bash", "-c"]
-
-RUN source ~/.bashrc && pyenv virtualenv 3.10 unstructured && \
- source ~/.pyenv/versions/unstructured/bin/activate && \
- make install-ci && \
- make install-ingest-s3 && \
- make install-ingest-azure && \
- make install-ingest-github && \
- make install-ingest-gitlab && \
- make install-ingest-wikipedia && \
- make install-ingest-discord && \
- make install install-ingest-slack && \
- make install-ingest-confluence
diff --git a/requirements/base.txt b/requirements/base.txt
index 17a25c4d40..862ed52ff9 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -8,9 +8,9 @@ anyio==4.9.0
# via httpx
backoff==2.2.1
# via -r ./base.in
-beautifulsoup4==4.13.3
+beautifulsoup4==4.13.4
# via -r ./base.in
-certifi==2025.1.31
+certifi==2025.4.26
# via
# httpcore
# httpx
@@ -42,11 +42,11 @@ exceptiongroup==1.2.2
# via anyio
filetype==1.2.0
# via -r ./base.in
-h11==0.14.0
+h11==0.16.0
# via httpcore
html5lib==1.1
# via -r ./base.in
-httpcore==1.0.7
+httpcore==1.0.9
# via httpx
httpx==0.28.1
# via unstructured-client
@@ -62,13 +62,13 @@ jsonpath-python==1.0.6
# via unstructured-client
langdetect==1.0.9
# via -r ./base.in
-lxml==5.3.1
+lxml==5.4.0
# via -r ./base.in
marshmallow==3.26.1
# via
# dataclasses-json
# unstructured-client
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
# via
# typing-inspect
# unstructured-client
@@ -80,9 +80,9 @@ numpy==2.0.2
# via -r ./base.in
olefile==0.47
# via python-oxmsg
-orderly-set==5.3.0
+orderly-set==5.4.0
# via deepdiff
-packaging==24.2
+packaging==25.0
# via
# marshmallow
# unstructured-client
@@ -100,7 +100,7 @@ python-magic==0.4.27
# via -r ./base.in
python-oxmsg==0.0.2
# via -r ./base.in
-rapidfuzz==3.12.2
+rapidfuzz==3.13.0
# via -r ./base.in
regex==2024.11.6
# via nltk
@@ -119,13 +119,13 @@ six==1.17.0
# unstructured-client
sniffio==1.3.1
# via anyio
-soupsieve==2.6
+soupsieve==2.7
# via beautifulsoup4
tqdm==4.67.1
# via
# -r ./base.in
# nltk
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via
# -r ./base.in
# anyio
diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt
index be1d0c40fd..9659e8bac1 100644
--- a/requirements/deps/constraints.txt
+++ b/requirements/deps/constraints.txt
@@ -22,3 +22,5 @@ importlib-metadata>=8.5.0
unstructured-client>=0.23.0,<0.26.0
# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file
protobuf>=6.30.0
+# (yao) issues with pdfminer-six 20250416 and above
+pdfminer.six<20250416
\ No newline at end of file
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 0de6c4eb02..b42ff70e01 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -17,7 +17,7 @@ distlib==0.3.9
# via virtualenv
filelock==3.18.0
# via virtualenv
-identify==2.6.9
+identify==2.6.10
# via pre-commit
importlib-metadata==8.6.1
# via
@@ -25,33 +25,31 @@ importlib-metadata==8.6.1
# build
nodeenv==1.9.1
# via pre-commit
-packaging==24.2
+packaging==25.0
# via
# -c ./base.txt
# -c ./test.txt
# build
pip-tools==7.4.1
# via -r ./dev.in
-platformdirs==4.3.6
+platformdirs==4.3.7
# via
# -c ./test.txt
# virtualenv
-pre-commit==4.1.0
+pre-commit==4.2.0
# via -r ./dev.in
pyproject-hooks==1.2.0
# via
# build
# pip-tools
pyyaml==6.0.2
- # via
- # -c ./test.txt
- # pre-commit
+ # via pre-commit
tomli==2.2.1
# via
# -c ./test.txt
# build
# pip-tools
-virtualenv==20.29.3
+virtualenv==20.30.0
# via pre-commit
wheel==0.45.1
# via pip-tools
diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt
index a5779f0a87..51885ae7ad 100644
--- a/requirements/extra-csv.txt
+++ b/requirements/extra-csv.txt
@@ -14,11 +14,11 @@ python-dateutil==2.9.0.post0
# via
# -c ./base.txt
# pandas
-pytz==2025.1
+pytz==2025.2
# via pandas
six==1.17.0
# via
# -c ./base.txt
# python-dateutil
-tzdata==2025.1
+tzdata==2025.2
# via pandas
diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt
index 7cdf55c7a7..f31b78b82a 100644
--- a/requirements/extra-docx.txt
+++ b/requirements/extra-docx.txt
@@ -4,13 +4,13 @@
#
# pip-compile ./extra-docx.in
#
-lxml==5.3.1
+lxml==5.4.0
# via
# -c ./base.txt
# python-docx
python-docx==1.1.2
# via -r ./extra-docx.in
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via
# -c ./base.txt
# python-docx
diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt
index 9d0a14da55..2311bce60f 100644
--- a/requirements/extra-markdown.txt
+++ b/requirements/extra-markdown.txt
@@ -8,7 +8,7 @@ importlib-metadata==8.6.1
# via
# -c ././deps/constraints.txt
# markdown
-markdown==3.7
+markdown==3.8
# via -r ./extra-markdown.in
zipp==3.21.0
# via importlib-metadata
diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt
index a157708ebd..ced65cd542 100644
--- a/requirements/extra-odt.txt
+++ b/requirements/extra-odt.txt
@@ -4,7 +4,7 @@
#
# pip-compile ./extra-odt.in
#
-lxml==5.3.1
+lxml==5.4.0
# via
# -c ./base.txt
# python-docx
@@ -12,7 +12,7 @@ pypandoc==1.15
# via -r ./extra-odt.in
python-docx==1.1.2
# via -r ./extra-odt.in
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via
# -c ./base.txt
# python-docx
diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
index a5264d7840..df43fc8f9b 100644
--- a/requirements/extra-paddleocr.txt
+++ b/requirements/extra-paddleocr.txt
@@ -18,11 +18,11 @@ anyio==4.9.0
# httpx
astor==0.8.1
# via paddlepaddle
-beautifulsoup4==4.13.3
+beautifulsoup4==4.13.4
# via
# -c ./base.txt
# unstructured-paddleocr
-certifi==2025.1.31
+certifi==2025.4.26
# via
# -c ./base.txt
# httpcore
@@ -44,13 +44,13 @@ exceptiongroup==1.2.2
# anyio
fire==0.7.0
# via unstructured-paddleocr
-fonttools==4.56.0
+fonttools==4.57.0
# via unstructured-paddleocr
-h11==0.14.0
+h11==0.16.0
# via
# -c ./base.txt
# httpcore
-httpcore==1.0.7
+httpcore==1.0.9
# via
# -c ./base.txt
# httpx
@@ -68,7 +68,7 @@ imageio==2.37.0
# via scikit-image
lazy-loader==0.4
# via scikit-image
-lxml==5.3.1
+lxml==5.4.0
# via
# -c ./base.txt
# python-docx
@@ -102,28 +102,28 @@ opencv-python-headless==4.11.0.86
# albumentations
opt-einsum==3.3.0
# via paddlepaddle
-packaging==24.2
+packaging==25.0
# via
# -c ./base.txt
# lazy-loader
# scikit-image
-paddlepaddle==3.0.0rc1
+paddlepaddle==3.0.0
# via -r ./extra-paddleocr.in
-pillow==11.1.0
+pillow==11.2.1
# via
# imageio
# paddlepaddle
# scikit-image
# unstructured-paddleocr
-protobuf==6.30.1
+protobuf==6.30.2
# via
# -c ././deps/constraints.txt
# paddlepaddle
pyclipper==1.3.0.post6
# via unstructured-paddleocr
-pydantic==2.10.6
+pydantic==2.11.3
# via albumentations
-pydantic-core==2.27.2
+pydantic-core==2.33.1
# via pydantic
python-docx==1.1.2
# via unstructured-paddleocr
@@ -131,7 +131,7 @@ pyyaml==6.0.2
# via
# albumentations
# unstructured-paddleocr
-rapidfuzz==3.12.2
+rapidfuzz==3.13.0
# via
# -c ./base.txt
# unstructured-paddleocr
@@ -153,13 +153,13 @@ sniffio==1.3.1
# via
# -c ./base.txt
# anyio
-soupsieve==2.6
+soupsieve==2.7
# via
# -c ./base.txt
# beautifulsoup4
-stringzilla==3.12.3
+stringzilla==3.12.5
# via albucore
-termcolor==2.5.0
+termcolor==3.0.1
# via fire
tifffile==2024.8.30
# via scikit-image
@@ -167,7 +167,7 @@ tqdm==4.67.1
# via
# -c ./base.txt
# unstructured-paddleocr
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via
# -c ./base.txt
# albucore
@@ -178,6 +178,9 @@ typing-extensions==4.12.2
# pydantic
# pydantic-core
# python-docx
+ # typing-inspection
+typing-inspection==0.4.0
+ # via pydantic
unstructured-paddleocr==2.10.0
# via -r ./extra-paddleocr.in
urllib3==1.26.20
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index 0226cee3e6..367924c7d6 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -8,7 +8,7 @@ antlr4-python3-runtime==4.9.3
# via omegaconf
cachetools==5.5.2
# via google-auth
-certifi==2025.1.31
+certifi==2025.4.26
# via
# -c ./base.txt
# requests
@@ -42,21 +42,21 @@ filelock==3.18.0
# transformers
flatbuffers==25.2.10
# via onnxruntime
-fonttools==4.56.0
+fonttools==4.57.0
# via matplotlib
-fsspec==2025.3.0
+fsspec==2025.3.2
# via
# huggingface-hub
# torch
google-api-core[grpc]==2.24.2
# via google-cloud-vision
-google-auth==2.38.0
+google-auth==2.39.0
# via
# google-api-core
# google-cloud-vision
google-cloud-vision==3.10.1
# via -r ./extra-pdf-image.in
-googleapis-common-protos==1.69.2
+googleapis-common-protos==1.70.0
# via
# google-api-core
# grpcio-status
@@ -67,7 +67,7 @@ grpcio==1.71.0
# grpcio-status
grpcio-status==1.62.3
# via google-api-core
-huggingface-hub==0.29.3
+huggingface-hub==0.30.2
# via
# timm
# tokenizers
@@ -85,7 +85,7 @@ jinja2==3.1.6
# via torch
kiwisolver==1.4.7
# via matplotlib
-lxml==5.3.1
+lxml==5.4.0
# via
# -c ./base.txt
# pikepdf
@@ -125,7 +125,7 @@ onnxruntime==1.19.2
# unstructured-inference
opencv-python==4.11.0.86
# via unstructured-inference
-packaging==24.2
+packaging==25.0
# via
# -c ./base.txt
# huggingface-hub
@@ -138,15 +138,16 @@ pandas==2.2.3
# via unstructured-inference
pdf2image==1.17.0
# via -r ./extra-pdf-image.in
-pdfminer-six==20240706
+pdfminer-six==20250327
# via
+ # -c ././deps/constraints.txt
# -r ./extra-pdf-image.in
# unstructured-inference
pi-heif==0.22.0
# via -r ./extra-pdf-image.in
-pikepdf==9.5.2
+pikepdf==9.7.0
# via -r ./extra-pdf-image.in
-pillow==11.1.0
+pillow==11.2.1
# via
# matplotlib
# pdf2image
@@ -158,7 +159,7 @@ proto-plus==1.26.1
# via
# google-api-core
# google-cloud-vision
-protobuf==6.30.1
+protobuf==6.30.2
# via
# -c ././deps/constraints.txt
# google-api-core
@@ -172,7 +173,7 @@ pyasn1==0.6.1
# via
# pyasn1-modules
# rsa
-pyasn1-modules==0.4.1
+pyasn1-modules==0.4.2
# via google-auth
pycocotools==2.0.8
# via effdet
@@ -180,7 +181,7 @@ pycparser==2.22
# via
# -c ./base.txt
# cffi
-pyparsing==3.2.1
+pyparsing==3.2.3
# via matplotlib
pypdf==5.4.0
# via
@@ -195,7 +196,7 @@ python-dateutil==2.9.0.post0
# pandas
python-multipart==0.0.20
# via unstructured-inference
-pytz==2025.1
+pytz==2025.2
# via pandas
pyyaml==6.0.2
# via
@@ -203,7 +204,7 @@ pyyaml==6.0.2
# omegaconf
# timm
# transformers
-rapidfuzz==3.12.2
+rapidfuzz==3.13.0
# via
# -c ./base.txt
# unstructured-inference
@@ -217,7 +218,7 @@ requests==2.32.3
# google-api-core
# huggingface-hub
# transformers
-rsa==4.9
+rsa==4.9.1
# via google-auth
safetensors==0.5.3
# via
@@ -229,7 +230,7 @@ six==1.17.0
# via
# -c ./base.txt
# python-dateutil
-sympy==1.13.1
+sympy==1.13.3
# via
# onnxruntime
# torch
@@ -241,13 +242,13 @@ tokenizers==0.21.1
# via
# -c ././deps/constraints.txt
# transformers
-torch==2.6.0
+torch==2.7.0
# via
# effdet
# timm
# torchvision
# unstructured-inference
-torchvision==0.21.0
+torchvision==0.22.0
# via
# effdet
# timm
@@ -256,15 +257,15 @@ tqdm==4.67.1
# -c ./base.txt
# huggingface-hub
# transformers
-transformers==4.49.0
+transformers==4.51.3
# via unstructured-inference
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via
# -c ./base.txt
# huggingface-hub
# pypdf
# torch
-tzdata==2025.1
+tzdata==2025.2
# via pandas
unstructured-inference==0.8.10
# via -r ./extra-pdf-image.in
diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt
index 41b37f70f0..7ec19718d8 100644
--- a/requirements/extra-pptx.txt
+++ b/requirements/extra-pptx.txt
@@ -4,13 +4,13 @@
#
# pip-compile ./extra-pptx.in
#
-lxml==5.3.1
+lxml==5.4.0
# via python-pptx
-pillow==11.1.0
+pillow==11.2.1
# via python-pptx
python-pptx==1.0.2
# via -r ./extra-pptx.in
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via python-pptx
-xlsxwriter==3.2.2
+xlsxwriter==3.2.3
# via python-pptx
diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt
index 895935708c..937191502d 100644
--- a/requirements/extra-xlsx.txt
+++ b/requirements/extra-xlsx.txt
@@ -20,13 +20,13 @@ python-dateutil==2.9.0.post0
# via
# -c ./base.txt
# pandas
-pytz==2025.1
+pytz==2025.2
# via pandas
six==1.17.0
# via
# -c ./base.txt
# python-dateutil
-tzdata==2025.1
+tzdata==2025.2
# via pandas
xlrd==2.0.1
# via -r ./extra-xlsx.in
diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
index 829a0448d4..a7c793c739 100644
--- a/requirements/huggingface.txt
+++ b/requirements/huggingface.txt
@@ -4,7 +4,7 @@
#
# pip-compile ./huggingface.in
#
-certifi==2025.1.31
+certifi==2025.4.26
# via
# -c ./base.txt
# requests
@@ -21,11 +21,11 @@ filelock==3.18.0
# huggingface-hub
# torch
# transformers
-fsspec==2025.3.0
+fsspec==2025.3.2
# via
# huggingface-hub
# torch
-huggingface-hub==0.29.3
+huggingface-hub==0.30.2
# via
# tokenizers
# transformers
@@ -53,7 +53,7 @@ numpy==2.0.2
# via
# -c ./base.txt
# transformers
-packaging==24.2
+packaging==25.0
# via
# -c ./base.txt
# huggingface-hub
@@ -82,13 +82,13 @@ six==1.17.0
# via
# -c ./base.txt
# langdetect
-sympy==1.13.1
+sympy==1.13.3
# via torch
tokenizers==0.21.1
# via
# -c ././deps/constraints.txt
# transformers
-torch==2.6.0
+torch==2.7.0
# via -r ./huggingface.in
tqdm==4.67.1
# via
@@ -96,9 +96,9 @@ tqdm==4.67.1
# huggingface-hub
# sacremoses
# transformers
-transformers==4.49.0
+transformers==4.51.3
# via -r ./huggingface.in
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via
# -c ./base.txt
# huggingface-hub
diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt
index 6c99d3cfcd..364f499029 100644
--- a/requirements/ingest/ingest.txt
+++ b/requirements/ingest/ingest.txt
@@ -1,4 +1,4 @@
-unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.1
+unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]>=0.2.1
s3fs>=2024.9.0
urllib3>=1.26.20
backoff>=2.2.1
diff --git a/requirements/test.in b/requirements/test.in
index ca9d2d5bfe..e9b8fadbf8 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -6,7 +6,6 @@ types-click
flake8
flake8-print
freezegun
-label_studio_sdk
mypy
pydantic
pytest-cov
@@ -15,7 +14,6 @@ ruff
types-Markdown
types-requests
types-tabulate
-vcrpy
grpcio
autoflake
liccheck
diff --git a/requirements/test.txt b/requirements/test.txt
index b64b5d52f5..2706ac725c 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -6,43 +6,23 @@
#
annotated-types==0.7.0
# via pydantic
-anyio==4.9.0
- # via
- # -c ./base.txt
- # httpx
-appdirs==1.4.4
- # via label-studio-sdk
-attrs==25.3.0
- # via jsonschema
autoflake==2.3.1
# via -r ./test.in
black==25.1.0
# via -r ./test.in
-certifi==2025.1.31
- # via
- # -c ./base.txt
- # httpcore
- # httpx
- # requests
-charset-normalizer==3.4.1
- # via
- # -c ./base.txt
- # requests
click==8.1.8
# via
# -c ./base.txt
# black
- # nltk
-coverage[toml]==7.7.0
+coverage[toml]==7.8.0
# via
# -r ./test.in
# pytest-cov
exceptiongroup==1.2.2
# via
# -c ./base.txt
- # anyio
# pytest
-flake8==7.1.2
+flake8==7.2.0
# via
# -r ./test.in
# flake8-print
@@ -54,100 +34,47 @@ grpcio==1.71.0
# via
# -c ././deps/constraints.txt
# -r ./test.in
-h11==0.14.0
- # via
- # -c ./base.txt
- # httpcore
-httpcore==1.0.7
- # via
- # -c ./base.txt
- # httpx
-httpx==0.28.1
- # via
- # -c ./base.txt
- # label-studio-sdk
-idna==3.10
- # via
- # -c ./base.txt
- # anyio
- # httpx
- # requests
- # yarl
-ijson==3.3.0
- # via label-studio-sdk
-iniconfig==2.0.0
+iniconfig==2.1.0
# via pytest
-joblib==1.4.2
- # via
- # -c ./base.txt
- # nltk
-jsonschema==3.2.0
- # via label-studio-sdk
-label-studio-sdk==1.0.5
- # via -r ./test.in
liccheck==0.9.2
# via -r ./test.in
-lxml==5.3.1
- # via
- # -c ./base.txt
- # label-studio-sdk
mccabe==0.7.0
# via flake8
-multidict==6.2.0
- # via yarl
mypy==1.15.0
# via -r ./test.in
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
# via
# -c ./base.txt
# black
# mypy
-nltk==3.9.1
- # via
- # -c ./base.txt
- # label-studio-sdk
-numpy==2.0.2
- # via
- # -c ./base.txt
- # pandas
-packaging==24.2
+packaging==25.0
# via
# -c ./base.txt
# black
# pytest
-pandas==2.2.3
- # via label-studio-sdk
pathspec==0.12.1
# via black
-pillow==11.1.0
- # via label-studio-sdk
-platformdirs==4.3.6
+platformdirs==4.3.7
# via black
pluggy==1.5.0
# via pytest
-propcache==0.3.0
- # via yarl
-pycodestyle==2.12.1
+pycodestyle==2.13.0
# via
# flake8
# flake8-print
-pydantic==2.10.6
- # via
- # -r ./test.in
- # label-studio-sdk
-pydantic-core==2.27.2
+pydantic==2.11.3
+ # via -r ./test.in
+pydantic-core==2.33.1
# via pydantic
-pyflakes==3.2.0
+pyflakes==3.3.2
# via
# autoflake
# flake8
-pyrsistent==0.20.0
- # via jsonschema
pytest==8.3.5
# via
# pytest-cov
# pytest-mock
-pytest-cov==6.0.0
+pytest-cov==6.1.1
# via -r ./test.in
pytest-mock==3.14.0
# via -r ./test.in
@@ -155,35 +82,14 @@ python-dateutil==2.9.0.post0
# via
# -c ./base.txt
# freezegun
- # pandas
-pytz==2025.1
- # via pandas
-pyyaml==6.0.2
- # via vcrpy
-regex==2024.11.6
- # via
- # -c ./base.txt
- # nltk
-requests==2.32.3
- # via
- # -c ./base.txt
- # label-studio-sdk
- # requests-mock
-requests-mock==1.12.1
- # via label-studio-sdk
-ruff==0.11.0
+ruff==0.11.7
# via -r ./test.in
semantic-version==2.10.0
# via liccheck
six==1.17.0
# via
# -c ./base.txt
- # jsonschema
# python-dateutil
-sniffio==1.3.1
- # via
- # -c ./base.txt
- # anyio
toml==0.10.2
# via liccheck
tomli==2.2.1
@@ -193,13 +99,9 @@ tomli==2.2.1
# coverage
# mypy
# pytest
-tqdm==4.67.1
- # via
- # -c ./base.txt
- # nltk
types-click==7.1.8
# via -r ./test.in
-types-markdown==3.7.0.20241204
+types-markdown==3.8.0.20250415
# via -r ./test.in
types-requests==2.31.0.6
# via -r ./test.in
@@ -207,36 +109,13 @@ types-tabulate==0.9.0.20241207
# via -r ./test.in
types-urllib3==1.26.25.14
# via types-requests
-typing-extensions==4.12.2
+typing-extensions==4.13.2
# via
# -c ./base.txt
- # anyio
# black
- # label-studio-sdk
- # multidict
# mypy
# pydantic
# pydantic-core
-tzdata==2025.1
- # via pandas
-ujson==5.10.0
- # via label-studio-sdk
-urllib3==1.26.20
- # via
- # -c ././deps/constraints.txt
- # -c ./base.txt
- # requests
- # vcrpy
-vcrpy==7.0.0
- # via -r ./test.in
-wrapt==1.17.2
- # via
- # -c ./base.txt
- # vcrpy
-xmljson==0.2.1
- # via label-studio-sdk
-yarl==1.18.3
- # via vcrpy
-
-# The following packages are considered to be unsafe in a requirements file:
-# setuptools
+ # typing-inspection
+typing-inspection==0.4.0
+ # via pydantic
diff --git a/scripts/docker-smoke-test.sh b/scripts/docker-smoke-test.sh
index 1d0950e923..0e66e05ae4 100755
--- a/scripts/docker-smoke-test.sh
+++ b/scripts/docker-smoke-test.sh
@@ -41,7 +41,7 @@ await_container
docker cp test_unstructured_ingest $CONTAINER_NAME:/app
docker cp requirements/ingest $CONTAINER_NAME:/app/requirements/ingest
docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R notebook-user:notebook-user /app/test_unstructured_ingest"
-docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/wikipedia.sh"
+docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/local.sh"
result=$?
exit $result
diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh
index 74ea031390..4fb21263a3 100755
--- a/scripts/user/unstructured-get-json.sh
+++ b/scripts/user/unstructured-get-json.sh
@@ -16,12 +16,20 @@ Options:
--hi-res hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
--fast fast strategy: No OCR, just extract embedded text
--ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
+ --vlm vlm strategy: Use Vision Language Model for processing
+ --vlm-provider Specify the VLM model provider
+ (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
+ --vlm-model Specify the VLM model to use with --vlm
+ (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
--tables Enable table extraction: tables are represented as html in metadata
--images Include base64images in json
--coordinates Include coordinates in the output
--trace Enable trace logging for debugging, useful to cut and paste the executed curl call
--verbose Enable verbose logging including printing first 8 elements to stdout
--s3 Write the resulting output to s3 (like a pastebin)
+ --write-html Convert JSON output to HTML. Set the env var UNST_WRITE_HTML=true to skip providing this option.
+ --open-html Automatically open the HTML output in a browser (macOS only) when --write-html is set.
+ Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option.
--help Display this help and exit.
@@ -42,8 +50,8 @@ fi
IMAGE_BLOCK_TYPES=${IMAGE_BLOCK_TYPES:-'"image", "table"'}
API_KEY=${UNST_API_KEY:-""}
-TMP_DOWNLOADS_DIR="$HOME/tmp/unst-downloads"
-TMP_OUTPUTS_DIR="$HOME/tmp/unst-outputs"
+TMP_DOWNLOADS_DIR=${UNST_SCRIPT_DOWNLOADS_DIR:-"$HOME/tmp/unst-downloads"}
+TMP_OUTPUTS_DIR=${UNST_SCRIPT_JSON_OUTPUTS_DIR:-"$HOME/tmp/unst-outputs"}
# only applicable if writing .json output files to S3 when using --s3, e.g. s3://bucket-name/path/
S3_URI_PREFIX=${UNST_S3_JSON_OUTPUT_URI:-""}
# e.g. us-east-2, used to provide http links for above location
@@ -64,6 +72,7 @@ copy_to_clipboard() {
HI_RES=false
FAST=false
OCR_ONLY=false
+VLM=false
STRATEGY=""
VERBOSE=false
TRACE=false
@@ -72,6 +81,10 @@ FREEMIUM=false
TABLES=true
IMAGES=false
S3=""
+WRITE_HTML=${UNST_WRITE_HTML:-false}
+OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false}
+VLM_PROVIDER=""
+VLM_MODEL=""
while [[ "$#" -gt 0 ]]; do
case "$1" in
@@ -87,6 +100,28 @@ while [[ "$#" -gt 0 ]]; do
OCR_ONLY=true
shift
;;
+ --vlm)
+ VLM=true
+ shift
+ ;;
+ --vlm-provider)
+ if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
+ VLM_PROVIDER=$2
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
+ --vlm-model)
+ if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
+ VLM_MODEL=$2
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
--trace)
TRACE=true
shift
@@ -99,6 +134,14 @@ while [[ "$#" -gt 0 ]]; do
S3=true
shift
;;
+ --write-html)
+ WRITE_HTML=true
+ shift
+ ;;
+ --open-html)
+ OPEN_HTML=true
+ shift
+ ;;
--tables)
TABLES=true
shift
@@ -140,6 +183,24 @@ if [ -z "$INPUT" ]; then
exit 1
fi
+# Check for strategy conflicts after all arguments are processed
+STRATEGY_COUNT=0
+$HI_RES && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+$FAST && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+$OCR_ONLY && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+$VLM && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+
+if [ "$STRATEGY_COUNT" -gt 1 ]; then
+ echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time."
+ exit 1
+fi
+
+# Check if vlm-provider or vlm-model are provided without --vlm
+if { [ -n "$VLM_PROVIDER" ] || [ -n "$VLM_MODEL" ]; } && ! $VLM; then
+ echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy."
+ exit 1
+fi
+
if $TRACE; then
set -x
fi
@@ -175,6 +236,25 @@ elif $OCR_ONLY; then
STRATEGY="-ocr-only"
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
CURL_STRATEGY=(-F "strategy=ocr_only")
+elif $VLM; then
+ if $VERBOSE; then echo "Sending API request with vlm strategy"; fi
+ STRATEGY="-vlm"
+ # Add provider and model to filename if specified
+ if [ -n "$VLM_PROVIDER" ] && [ -n "$VLM_MODEL" ]; then
+ STRATEGY="-vlm-${VLM_PROVIDER}-${VLM_MODEL}"
+ elif [ -n "$VLM_PROVIDER" ]; then
+ STRATEGY="-vlm-${VLM_PROVIDER}"
+ elif [ -n "$VLM_MODEL" ]; then
+ STRATEGY="-vlm-model-${VLM_MODEL}"
+ fi
+ JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
+ CURL_STRATEGY=(-F "strategy=vlm")
+ if [ -n "$VLM_PROVIDER" ]; then
+ CURL_STRATEGY+=(-F "vlm_model_provider=$VLM_PROVIDER")
+ fi
+ if [ -n "$VLM_MODEL" ]; then
+ CURL_STRATEGY+=(-F "vlm_model=$VLM_MODEL")
+ fi
else
if $VERBOSE; then echo "Sending API request WITHOUT a strategy"; fi
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
@@ -213,6 +293,44 @@ else
fi
echo "JSON Output file: ${JSON_OUTPUT_FILEPATH}"
+# Convert JSON to HTML if requested
+if [ "$WRITE_HTML" = true ]; then
+ HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html
+
+ if $VLM; then
+ # VLM output has all metadata.text_as_html fields defined, so
+ # create HTML directly from the metadata.text_as_html fields
+    {
+      echo "<!DOCTYPE html>"
+      echo "<html>"
+      echo "<head>"
+      echo "  <meta charset=\"utf-8\">"
+      echo "  <title>${FILENAME}</title>"
+      echo "</head>"
+      echo "<body>"
+      jq -r 'map(.metadata.text_as_html) | join("\n")' "${JSON_OUTPUT_FILEPATH}"
+      echo "</body>"
+      echo "</html>"
+    } >"${HTML_OUTPUT_FILEPATH}"
+ echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}"
+ else
+    # most elements will not have metadata.text_as_html defined (by design, only Table elements do),
+ # so use the unstructured library's python script for the conversion.
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"
+ echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
+ fi
+
+ # Open HTML file in browser if requested and on macOS
+ if [ "$OPEN_HTML" = true ] && [ "$(uname)" == "Darwin" ]; then
+ open "${HTML_OUTPUT_FILEPATH}"
+ fi
+fi
+
# write .json output to s3 location
if [ -n "$S3" ]; then
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index f63e738a7c..ffaa699cac 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -31,6 +31,7 @@
CompositeElement,
Element,
ElementMetadata,
+ Image,
PageBreak,
Table,
TableChunk,
@@ -234,6 +235,10 @@ def it_accumulates_elements_added_to_it(self):
assert builder._text_length == 112
assert builder._remaining_space == 36
+ def it_will_fit_when_element_has_none_as_text(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions())
+ assert builder.will_fit(Image(None))
+
def it_will_fit_an_oversized_element_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions())
assert builder.will_fit(Text("abcd " * 200))
@@ -405,6 +410,12 @@ def and_it_knows_it_is_NOT_equal_to_an_object_that_is_not_a_PreChunk(self):
pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions())
assert pre_chunk != 42
+ def it_can_handle_element_with_none_as_text(self):
+ pre_chunk = PreChunk(
+ [Image(None), Text("hello")], overlap_prefix="", opts=ChunkingOptions()
+ )
+ assert pre_chunk._text == "hello"
+
@pytest.mark.parametrize(
("max_characters", "combine_text_under_n_chars", "expected_value"),
[
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index 8376e4440a..ec6c805f34 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -15,6 +15,7 @@
LogCaptureFixture,
Mock,
example_doc_path,
+ input_path,
patch,
property_mock,
)
@@ -30,6 +31,7 @@
is_in_docker = os.path.exists("/.dockerenv")
+
# ================================================================================================
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
# ================================================================================================
@@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
file_buffer.name = "filename.pdf"
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
assert predicted_type == FileType.NDJSON
+
+
+def test_office_files_when_document_archive_has_non_standard_prefix():
+    predicted_type = detect_filetype(
+ file_path=input_path("file_type/test_document_from_office365.docx")
+ )
+ assert predicted_type == FileType.DOCX
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 6d1145eb80..70eec35fd7 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -823,8 +823,8 @@ def test_partition_categorization_backup():
example_doc_path("pdf/layout-parser-paper-fast.pdf"),
strategy=PartitionStrategy.HI_RES,
)
- # Should have changed the element class from Text to Title
- assert isinstance(elements[0], Title)
+ # Should NOT have changed the element class from Text to Title
+ assert isinstance(elements[0], Text)
assert elements[0].text == text
@@ -1603,3 +1603,24 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
+
+
+def test_reproducible_pdf_loader():
+    """Partitioning the same PDF repeatedly must yield elements in the same order."""
+    f = example_doc_path("pdf/layout-parser-paper.pdf")
+    elements_1 = pdf.partition_pdf(
+        filename=f,
+        strategy=PartitionStrategy.AUTO,
+        infer_table_structure=False,
+    )
+    for _ in range(4):
+        elements_2 = pdf.partition_pdf(
+            filename=f,
+            strategy=PartitionStrategy.AUTO,
+            infer_table_structure=False,
+        )
+        for e1, e2 in zip(elements_1, elements_2):
+            assert e1.text == e2.text, f"loading {f=} twice returned different results"
diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
index bfb09b762a..1be79e92a0 100644
--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -73,6 +73,7 @@ def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-i
[
(example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
(example_doc_path("img/layout-parser-paper-fast.jpg"), True),
+ (example_doc_path("img/english-and-korean.png"), True),
],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py
index d1d66876ed..94b12d5578 100644
--- a/test_unstructured/partition/test_msg.py
+++ b/test_unstructured/partition/test_msg.py
@@ -141,7 +141,7 @@ def test_partition_msg_can_process_attachments():
"Text",
"Text",
"Image",
- "Title",
+ "Text",
"Text",
"Title",
"Title",
diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py
index 6d3be972b7..11ca79d064 100644
--- a/test_unstructured/staging/test_label_studio.py
+++ b/test_unstructured/staging/test_label_studio.py
@@ -1,11 +1,6 @@
from __future__ import annotations
-import logging
-import re
-
import pytest
-import vcr
-from label_studio_sdk import Client
from test_unstructured.unit_utils import assign_hash_ids
from unstructured.documents.elements import Element, NarrativeText, Title
@@ -17,62 +12,6 @@ def elements():
return [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
-@vcr.use_cassette(
- "test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml",
- allow_playback_repeats=True,
-)
-def test_upload_label_studio_data_with_sdk(
- caplog: pytest.LogCaptureFixture, elements: list[Element]
-):
- """
- Testing Instructions
- ====================
- 1. Remove file `test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml`,
- which will be recreated later.
- 2. Install the label-studio package by running command `pip install -U label-studio`.
- 3. Run command `label-studio`, and login or set up label studio account on pop-up website.
- 4. Update `LABEL_STUDIO_URL` and `API_KEY` below, you can find your API_KEY by
- clicking into your account profile.
- 5. Run this test once, and VCR will record the HTTP request to the yaml file.
- 6. Kill the label studio instance and run the test again, VCR will replay the response.
- """
- log = logging.getLogger("urllib3")
- log.setLevel(logging.DEBUG)
- # Define the URL where Label Studio is accessible
- LABEL_STUDIO_URL = "http://localhost:8080"
- # API_KEY is a temporary key from local install not actually valid anywhere
- # Update it if the vcr cassette is updated with the API key from your user account
- API_KEY = "7b613506d5afa062fe33c9cd825f106c718b82a0"
- # Connect to the Label Studio API and check the connection
- ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY)
- ls.check_connection()
- ls.delete_all_projects()
- # Create a sample project to classify types of texts
- project = ls.start_project(
- title="Text Type Classifications",
- label_config="""
-
-
-
-
-
-
-
-
-
-
- """,
- )
- label_studio_data = label_studio.stage_for_label_studio(elements)
- project.import_tasks(label_studio_data)
- # Check success status code (201) for posting tasks job in logger info
- success_posting_tasks_status = re.compile(r"POST /api/projects/.*/import.*201")
- assert bool(success_posting_tasks_status.search(caplog.text))
-
-
def test_convert_to_label_studio_data(elements: list[Element]):
label_studio_data = label_studio.stage_for_label_studio(elements)
diff --git a/test_unstructured/testfiles/file_type/test_document_from_office365.docx b/test_unstructured/testfiles/file_type/test_document_from_office365.docx
new file mode 100644
index 0000000000..fd9ca065eb
Binary files /dev/null and b/test_unstructured/testfiles/file_type/test_document_from_office365.docx differ
diff --git a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml b/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml
deleted file mode 100644
index bf4f22255c..0000000000
--- a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml
+++ /dev/null
@@ -1,414 +0,0 @@
-interactions:
-- request:
- body: null
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- User-Agent:
- - python-requests/2.28.0
- method: GET
- uri: http://localhost:8080/api/version
- response:
- body:
- string: '{"release": "1.7.3", "label-studio-os-package": {"version": "1.7.3",
- "short_version": "1.7", "latest_version_from_pypi": "1.7.3", "latest_version_upload_time":
- "2023-04-19T12:05:18", "current_version_is_outdated": false}, "label-studio-os-backend":
- {"message": "Merge pull request #2612 from laggardkernel/bugfix/realpath-in-version
- ...", "commit": "fcd7806529ea60cf5e56c782345ced04659d018d", "date": "2023/02/06
- 20:09:22", "branch": "master", "version": "2.3.12+10.gfcd78065"}, "label-studio-frontend":
- {"message": "fix: LSDV-4692: Brush segmentation is not supported", "commit":
- "f08871a3e70026b12cad502552251db1fba1619e", "branch": "master", "date": "2023/03/29
- 14:40:33"}, "dm2": {"message": "fix: LSDV-4746-1: Only include limited fields
- for project when polling", "commit": "9aa96a97e9bcb4154838249dc721efbc724198b7",
- "branch": "master", "date": "2023/03/13 15:43:21"}, "label-studio-converter":
- {"version": "0.0.51"}}'
- headers:
- Content-Language:
- - en-us
- Content-Length:
- - '924'
- Content-Type:
- - application/json
- Date:
- - Thu, 01 Jun 2023 21:17:59 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgF:YW6N1NblXlgyM_81UyYNBkcxIWjokDRdWetCeeQfDgA;
- expires=Thu, 15 Jun 2023 21:17:59 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 200
- message: OK
-- request:
- body: null
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgF:YW6N1NblXlgyM_81UyYNBkcxIWjokDRdWetCeeQfDgA
- User-Agent:
- - python-requests/2.28.0
- method: GET
- uri: http://localhost:8080/health
- response:
- body:
- string: '{"status": "UP"}'
- headers:
- Content-Language:
- - en-us
- Content-Length:
- - '16'
- Content-Type:
- - text/html; charset=utf-8
- Date:
- - Thu, 01 Jun 2023 21:18:00 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgG:wG1DT2Iz8ZHJlPxwIMum_NVMweQyXE7bbbbiX0tNCuQ;
- expires=Thu, 15 Jun 2023 21:18:00 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 200
- message: OK
-- request:
- body: null
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgG:wG1DT2Iz8ZHJlPxwIMum_NVMweQyXE7bbbbiX0tNCuQ
- User-Agent:
- - python-requests/2.28.0
- method: GET
- uri: http://localhost:8080/api/projects?page_size=10000000
- response:
- body:
- string: '{"count":1,"next":null,"previous":null,"results":[{"id":23,"title":"Text
- Type Classifications","description":"","label_config":"
\n \n \n \n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential
- sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}]}'
- headers:
- Allow:
- - GET, POST, HEAD, OPTIONS
- Content-Language:
- - en-us
- Content-Length:
- - '2033'
- Content-Type:
- - application/json
- Date:
- - Thu, 01 Jun 2023 21:18:01 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU;
- expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 200
- message: OK
-- request:
- body: null
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU
- User-Agent:
- - python-requests/2.28.0
- method: GET
- uri: http://localhost:8080/api/projects/23
- response:
- body:
- string: '{"id":23,"title":"Text Type Classifications","description":"","label_config":"
\n \n \n \n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential
- sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}'
- headers:
- Allow:
- - GET, PUT, PATCH, DELETE, HEAD, OPTIONS
- Content-Language:
- - en-us
- Content-Length:
- - '1981'
- Content-Type:
- - application/json
- Date:
- - Thu, 01 Jun 2023 21:18:01 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU;
- expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 200
- message: OK
-- request:
- body: null
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU
- User-Agent:
- - python-requests/2.28.0
- method: GET
- uri: http://localhost:8080/api/projects/23
- response:
- body:
- string: '{"id":23,"title":"Text Type Classifications","description":"","label_config":"
\n \n \n \n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential
- sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}'
- headers:
- Allow:
- - GET, PUT, PATCH, DELETE, HEAD, OPTIONS
- Content-Language:
- - en-us
- Content-Length:
- - '1981'
- Content-Type:
- - application/json
- Date:
- - Thu, 01 Jun 2023 21:18:01 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU;
- expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 200
- message: OK
-- request:
- body: null
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- Content-Length:
- - '0'
- Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU
- User-Agent:
- - python-requests/2.28.0
- method: DELETE
- uri: http://localhost:8080/api/projects/23/
- response:
- body:
- string: ''
- headers:
- Allow:
- - GET, PUT, PATCH, DELETE, HEAD, OPTIONS
- Content-Language:
- - en-us
- Content-Length:
- - '0'
- Date:
- - Thu, 01 Jun 2023 21:18:01 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU;
- expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 204
- message: No Content
-- request:
- body: '{"title": "Text Type Classifications", "label_config": "\n
\n \n \n \n \n \n \n \n \n \n "}'
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- Content-Length:
- - '591'
- Content-Type:
- - application/json
- Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU
- User-Agent:
- - python-requests/2.28.0
- method: POST
- uri: http://localhost:8080/api/projects
- response:
- body:
- string: '{"id":24,"title":"Text Type Classifications","description":"","label_config":"
\n \n \n \n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T21:18:01.964955Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":null,"task_number":null,"useful_annotation_number":null,"ground_truth_number":null,"skipped_annotations_number":null,"total_annotations_number":null,"total_predictions_number":null,"sampling":"Sequential
- sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":null}'
- headers:
- Allow:
- - GET, POST, HEAD, OPTIONS
- Content-Language:
- - en-us
- Content-Length:
- - '2005'
- Content-Type:
- - application/json
- Date:
- - Thu, 01 Jun 2023 21:18:02 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw;
- expires=Thu, 15 Jun 2023 21:18:02 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 201
- message: Created
-- request:
- body: '[{"data": {"text": "Title 1", "ref_id": "ab03af41c2940e7584b62df48a964db3"}},
- {"data": {"text": "Narrative 1", "ref_id": "ff9eb806beb1f483322f6fbda680b08b"}}]'
- headers:
- Accept:
- - '*/*'
- Accept-Encoding:
- - gzip, deflate
- Authorization:
- - Token 7b613506d5afa062fe33c9cd825f106c718b82a0
- Connection:
- - keep-alive
- Content-Length:
- - '158'
- Content-Type:
- - application/json
- Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw
- User-Agent:
- - python-requests/2.28.0
- method: POST
- uri: http://localhost:8080/api/projects/24/import?return_task_ids=1
- response:
- body:
- string: '{"task_count":2,"annotation_count":0,"prediction_count":0,"duration":0.1579442024230957,"file_upload_ids":[],"could_be_tasks_list":false,"found_formats":[],"data_columns":[],"task_ids":[1,2]}'
- headers:
- Allow:
- - POST, OPTIONS
- Content-Language:
- - en-us
- Content-Length:
- - '191'
- Content-Type:
- - application/json
- Date:
- - Thu, 01 Jun 2023 21:18:02 GMT
- Referrer-Policy:
- - same-origin
- Server:
- - WSGIServer/0.2 CPython/3.8.15
- Set-Cookie:
- - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw;
- expires=Thu, 15 Jun 2023 21:18:02 GMT; HttpOnly; Max-Age=1209600; Path=/;
- SameSite=Lax
- Vary:
- - Accept-Language, Cookie, Origin
- X-Content-Type-Options:
- - nosniff
- status:
- code: 201
- message: Created
-version: 1
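The cassette deleted above records plain Label Studio REST traffic; for reference, here is a minimal `requests` sketch of the recorded task-import call (base URL, token, and project id are placeholder values copied from the fixture, not live credentials):

```python
import requests

# Placeholder values mirroring the recorded fixture, not live credentials.
BASE_URL = "http://localhost:8080"
TOKEN = "7b613506d5afa062fe33c9cd825f106c718b82a0"  # fixture token
PROJECT_ID = 24

# Task payloads exactly as recorded in the cassette body above.
tasks = [
    {"data": {"text": "Title 1", "ref_id": "ab03af41c2940e7584b62df48a964db3"}},
    {"data": {"text": "Narrative 1", "ref_id": "ff9eb806beb1f483322f6fbda680b08b"}},
]

# POST /api/projects/{id}/import?return_task_ids=1, as in the cassette.
response = requests.post(
    f"{BASE_URL}/api/projects/{PROJECT_ID}/import",
    params={"return_task_ids": 1},
    headers={"Authorization": f"Token {TOKEN}"},
    json=tasks,
)
response.raise_for_status()
print(response.json()["task_ids"])  # the recorded response returned [1, 2]
```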
diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html
index a55cccdbbd..210109c06e 100644
--- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html
+++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html
@@ -14,9 +14,9 @@
Contents lists available at ScienceDirect
-
+
Data in Brief
-
+
journal homepage: www.elsevier.com/locate/dib
@@ -28,19 +28,19 @@
Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment
-
+
-
+
Omotayo Sanni *, Abimbola Patricia I. Popoola
Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa
-
+
article info
-
+
abstract
@@ -88,19 +88,19 @@
Value of the data
-
+
• Data presented here provide optimum conditions of waste material as an inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in a sulphuric acid environment.
-
+
• The data obtained for the inhibition of a waste product (egg shell powder) on stainless steel Type 316 can be used as a basis for determining the inhibitive performance of the same inhibitor in other environments.
-
+
• The data can be used to examine the relationship between the process variables as they affect the
@@ -152,9 +152,9 @@
| Inhibitor concentration (g) | bc (V/dec) | ba (V/dec) | Ecorr (V) | icorr (A/cm²) | Polarization resistance (Ω) | Corrosion rate (mm/year) |
|---|---|---|---|---|---|---|
| 0 | 0.0335 | 0.0409 | −0.9393 | 0.0003 | 24.0910 | 2.8163 |
| 2 | 1.9460 | 0.0596 | −0.8276 | 0.0002 | 121.440 | 1.5054 |
| 4 | 0.0163 | 0.2369 | −0.8825 | 0.0001 | 42.121 | 0.9476 |
| 6 | 0.3233 | 0.0540 | −0.8027 | 5.39E-05 | 373.180 | 0.4318 |
| 8 | 0.1240 | 0.0556 | −0.5896 | 5.46E-05 | 305.650 | 0.3772 |
| 10 | 0.0382 | 0.0086 | −0.5356 | 1.24E-05 | 246.080 | 0.0919 |
-
+
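The icorr column above supports the usual inhibition-efficiency check; the formula below is the standard textbook definition (with i_corr^0 the uninhibited current density), not one stated in this fixture:

```latex
\mathrm{IE}(\%) = \left(1 - \frac{i_{\mathrm{corr}}^{\mathrm{inh}}}{i_{\mathrm{corr}}^{0}}\right) \times 100
```

At 10 g, for example, IE ≈ (1 − 1.24×10⁻⁵ / 3×10⁻⁴) × 100 ≈ 95.9%.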
The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line, as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on the stainless steel surface in 0.5 M H2SO4 follows the Langmuir adsorption isotherm. Figs. 6–8 show the SEM/EDX surface morphology analysis of the stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after the weight loss experiment in sulphuric acid medium. The corrosion product layer on the stainless steel surface in the absence of inhibitor was porous and as a result gave no corrosion protection. In the presence of ES, corrosion damage was minimized, with evidence of ES present on the metal surface as shown in Fig. 8.
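For reference, the straight-line plot described above is the linearized Langmuir isotherm; the symbols below are standard textbook notation (C: inhibitor concentration, θ: degree of surface coverage, K_ads: adsorption equilibrium constant), not labels taken from this file:

```latex
\frac{C}{\theta} = \frac{1}{K_{\mathrm{ads}}} + C
```

A slope near unity in the C/θ versus C plot is the signature of Langmuir adsorption.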
@@ -232,12 +232,12 @@
The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three-electrode system was used: a stainless steel Type 316 plate as the working electrode with an exposed area of 1.0 cm², a platinum rod as the counter electrode, and a silver chloride electrode as the reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. The slope of the linear part in the anodic and cathodic plots gives the anodic and cathodic constants according to the Stern–Geary equation, and the
-
+
(2)
-
-
+
+
(3)
-
+
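Only the markers (2) and (3) survived for the equations themselves; the textbook Stern–Geary relation that the surrounding text invokes is, in its standard form (Tafel slopes b_a and b_c, polarization resistance R_p; not necessarily the paper's exact equations):

```latex
i_{\mathrm{corr}} = \frac{b_a\, b_c}{2.303\,(b_a + b_c)\, R_p}
```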
diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html
index bb95afd2b2..aabc7233cc 100644
--- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html
+++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html
@@ -14,9 +14,9 @@
Contents lists available at ScienceDirect
-
+
Data in Brief
-
+
journal homepage: www.elsevier.com/locate/dib
@@ -28,9 +28,9 @@
A benchmark dataset for the multiple depot vehicle scheduling problem
-
+
-
+
Sarang Kulkarni a,b,c,*, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. Ernst c, Rahul Patil b
@@ -52,16 +52,16 @@
e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia
-
+
f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India
-
+
article info
-
+
abstract
@@ -106,13 +106,13 @@
• The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations.
-
+
• All the problem instances are available for use without any restrictions.
• The benchmark solutions and solution times for the problem instances are presented in [3] and can be used for comparison.
-
+
• The dataset includes a program that can generate similar problem instances of different sizes.
@@ -121,9 +121,9 @@
The dataset contains 60 different problem instances of the multiple depot vehicle scheduling problem (MDVSP). Each problem instance is provided in a separate file. Each file is named 'RN-m-n-k.dat', where 'm', 'n', and 'k' denote the number of depots, the number of trips, and the instance number for the size (m, n), respectively. For example, the problem instance 'RN-8-1500-01.dat' is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values: 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size (m, n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided (a filename-parsing sketch follows the list below):
-
+
• The number of depots (m),
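A minimal parsing sketch for the naming scheme described above; the helper name and regex are illustrative, not part of the dataset distribution:

```python
import re

# 'RN-<depots>-<trips>-<instance>.dat', e.g. 'RN-8-1500-01.dat'.
_INSTANCE_NAME = re.compile(r"RN-(?P<m>\d+)-(?P<n>\d+)-(?P<k>\d+)\.dat$")

def parse_instance_name(filename: str) -> tuple[int, int, int]:
    """Return (depots, trips, instance number) for an MDVSP file name."""
    match = _INSTANCE_NAME.search(filename)
    if match is None:
        raise ValueError(f"not an MDVSP instance file: {filename!r}")
    return int(match["m"]), int(match["n"]), int(match["k"])

assert parse_instance_name("RN-8-1500-01.dat") == (8, 1500, 1)
```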
@@ -187,9 +187,9 @@
| Instance size (m, n) | Avg. locations | Avg. times | Avg. vehicles | Avg. possible empty travels |
|---|---|---|---|---|
| (8, 1500) | 568.40 | 975.20 | 652.20 | 668,279.40 |
| (8, 2000) | 672.80 | 1048.00 | 857.20 | 1,195,844.80 |
| (8, 2500) | 923.40 | 1078.00 | 1082.40 | 1,866,175.20 |
| (8, 3000) | 977.00 | 1113.20 | 1272.80 | 2,705,617.00 |
| (12, 1500) | 566.00 | 994.00 | 642.00 | 674,191.00 |
| (12, 2000) | 732.60 | 1040.60 | 861.20 | 1,199,659.80 |
| (12, 2500) | 875.00 | 1081.00 | 1096.00 | 1,878,745.20 |
| (12, 3000) | 1119.60 | 1107.40 | 1286.20 | 2,711,180.40 |
| (16, 1500) | 581.80 | 985.40 | 667.80 | 673,585.80 |
| (16, 2000) | 778.00 | 1040.60 | 872.40 | 1,200,560.80 |
| (16, 2500) | 879.00 | 1083.20 | 1076.40 | 1,879,387.00 |
| (16, 3000) | 1087.20 | 1101.60 | 1284.60 | 2,684,983.60 |
-
+
diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html
index 0862a71a27..eabce53c29 100644
--- a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html
+++ b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html
@@ -76,8 +76,8 @@
Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2,
-
+
-
+