Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.21.11

### Enhancements
- **Add speech-to-text to multimodal pipeline**: Audio files (WAV, MP3, FLAC, M4A, OGG, OPUS, WEBM, and any format supported by ffmpeg) can now be partitioned into document elements via speech-to-text. Install the optional `audio` extra (`pip install "unstructured[audio]"`) to use the Whisper-based partitioner. Call `partition()` or `partition_audio()` with an audio file to get a transcript as `NarrativeText` elements, each carrying `segment_start_seconds` / `segment_end_seconds` metadata. **Known limitation**: segment timestamps are dropped when elements are merged by a chunking strategy; consume un-chunked elements directly if audio timeline alignment is required.

## 0.21.10
- **Add Form Class**: Adds a new form class in `elements.py` for handling forms.

Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,12 @@ xlsx = [
"pandas>=2.0.0, <4.0.0",
"xlrd>=2.0.1, <3.0.0",
]
# Speech-to-text for partition_audio (multimodal: audio -> elements)
audio = [
"openai-whisper>=20231117, <20270000",
]
all-docs = [
"unstructured[csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
"unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
]
# Feature extras
chunking-tokens = [
Expand Down
3 changes: 3 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,14 @@ def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_ty
("expected_value", "file_name", "mime_type"),
[
(FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
(FileType.BMP, "img/bmp_24.bmp", "image/x-bmp"),
(FileType.BMP, "img/bmp_24.bmp", "image/x-ms-bmp"),
(FileType.CSV, "stanley-cups.csv", "text/csv"),
(FileType.CSV, "stanley-cups.csv", "application/csv"),
(FileType.CSV, "stanley-cups.csv", "application/x-csv"),
(FileType.EML, "eml/fake-email.eml", "message/rfc822"),
(FileType.HEIC, "img/DA-1p.heic", "image/heic"),
(FileType.HEIC, "img/DA-1p.heic", "image/x-heic"),
(FileType.HTML, "example-10k-1p.html", "text/html"),
(FileType.JPG, "img/example.jpg", "image/jpeg"),
(FileType.JSON, "spring-weather.html.json", "application/json"),
Expand Down
19 changes: 9 additions & 10 deletions test_unstructured/file_utils/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def it_can_recognize_a_file_type_from_a_mime_type(
):
assert FileType.from_mime_type(mime_type) is file_type

@pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None])
@pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "foo/bar", None])
def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
self, mime_type: str | None
):
Expand All @@ -76,7 +76,7 @@ def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
(FileType.PDF, "pdf"),
(FileType.XLS, "xlsx"),
(FileType.UNK, None),
(FileType.WAV, None),
(FileType.WAV, "audio"),
(FileType.ZIP, None),
],
)
Expand All @@ -98,7 +98,7 @@ def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies
(FileType.ODT, ("docx", "pypandoc")),
(FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
(FileType.UNK, ()),
(FileType.WAV, ()),
(FileType.WAV, ()), # STT agent deps validated at runtime
(FileType.ZIP, ()),
],
)
Expand All @@ -119,7 +119,7 @@ def it_knows_which_importable_packages_its_partitioner_depends_on(
(FileType.JPG, True),
(FileType.PDF, True),
(FileType.PPTX, True),
(FileType.WAV, False),
(FileType.WAV, True),
(FileType.ZIP, False),
(FileType.EMPTY, False),
(FileType.UNK, False),
Expand Down Expand Up @@ -163,14 +163,13 @@ def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
(FileType.JPG, "partition_image"),
(FileType.PNG, "partition_image"),
(FileType.TIFF, "partition_image"),
(FileType.WAV, "partition_audio"),
],
)
def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
assert file_type.partitioner_function_name == expected_value

@pytest.mark.parametrize(
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
)
@pytest.mark.parametrize("file_type", [FileType.ZIP, FileType.EMPTY, FileType.UNK])
def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
self, file_type: FileType
):
Expand All @@ -189,16 +188,15 @@ def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_
(FileType.JPG, "unstructured.partition.image"),
(FileType.PNG, "unstructured.partition.image"),
(FileType.TIFF, "unstructured.partition.image"),
(FileType.WAV, "unstructured.partition.audio"),
],
)
def it_knows_the_fully_qualified_name_of_its_partitioner_module(
self, file_type: FileType, expected_value: str
):
assert file_type.partitioner_module_qname == expected_value

@pytest.mark.parametrize(
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
)
@pytest.mark.parametrize("file_type", [FileType.ZIP, FileType.EMPTY, FileType.UNK])
def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
self, file_type: FileType
):
Expand All @@ -217,6 +215,7 @@ def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_p
(FileType.JPG, "image"),
(FileType.PNG, "image"),
(FileType.TIFF, "image"),
(FileType.WAV, "audio"),
(FileType.XLS, "xlsx"),
(FileType.XLSX, "xlsx"),
],
Expand Down
Loading
Loading