From ea9fda8189d9c2452ee7fbdd13d2d09f01eb75b8 Mon Sep 17 00:00:00 2001
From: Ty Todd
Date: Thu, 12 Jun 2025 02:33:56 -0400
Subject: [PATCH 01/25] passes all but 1 test case

---
 src/datasets/config.py         |   1 +
 src/datasets/features/video.py | 108 ++++++++++++++++++++-------
 tests/features/test_video.py   |  44 +++++++-------
 tests/utils.py                 |  12 ++++
 4 files changed, 100 insertions(+), 65 deletions(-)

diff --git a/src/datasets/config.py b/src/datasets/config.py
index 33d86209287..045ad5f92ec 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -140,6 +140,7 @@
 IS_MP3_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse(
     importlib.import_module("soundfile").__libsndfile_version__
 ) >= version.parse("1.1.0")
+TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None
 TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
 PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
 
diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py
index e8c5a0e4456..d34535a55dd 100644
--- a/src/datasets/features/video.py
+++ b/src/datasets/features/video.py
@@ -1,6 +1,6 @@
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, TypedDict, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, TypedDict, Union
 
 import numpy as np
 import pyarrow as pa
@@ -13,7 +13,8 @@
 
 
 if TYPE_CHECKING:
-    from torchvision.io import VideoReader
+    import torch
+    from torchcodec.decoders import VideoDecoder
 
     from .features import FeatureType
 
@@ -37,7 +38,7 @@ class Video:
 
           This is useful for archived files with sequential access.
 
-        - A `torchvision.io.VideoReader`: torchvision video reader object.
+        - A `torchcodec.decoders.VideoDecoder`: torchcodec video decoder object.
 
     Args:
         mode (`str`, *optional*):
@@ -54,7 +55,7 @@ class Video:
    >>> ds.features["video"]
    Video(decode=True, id=None)
    >>> ds[0]["video"]
-    <torchvision.io.video_reader.VideoReader object at 0x...>
+    <torchcodec.decoders.VideoDecoder object at 0x...>
    >>> ds = ds.cast_column('video', Video(decode=False))
    {'bytes': None, 'path': 'path/to/Screen Recording.mov'}
    ```
    """

    decode: bool = True
    id: Optional[str] = None
+    stream_index: Optional[int] = None
+    dimension_order: Literal["NCHW", "NHWC"] = "NCHW"
+    num_ffmpeg_threads: int = 1
+    device: Optional[Union[str, "torch.device"]] = "cpu"
+    seek_mode: Literal["exact", "approximate"] = "exact"
    # Automatically constructed
-    dtype: ClassVar[str] = "torchvision.io.VideoReader"
+    dtype: ClassVar[str] = "torchcodec.decoders.VideoDecoder"
    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
    _type: str = field(default="Video", init=False, repr=False)

    def __call__(self):
        return self.pa_type

-    def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray, "VideoReader"]) -> Example:
+    def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray, "VideoDecoder"]) -> Example:
        """Encode example into a format for Arrow.

        Args:
-            value (`str`, `np.ndarray`, `VideoReader` or `dict`):
+            value (`str`, `np.ndarray`, `VideoDecoder` or `dict`):
                Data passed as input to Video feature.

Returns: `dict` with "path" and "bytes" fields """ - if config.TORCHVISION_AVAILABLE: - from torchvision.io import VideoReader + if config.TORCHCODEC_AVAILABLE: + from torchcodec.decoders import VideoDecoder else: - VideoReader = None + VideoDecoder = None if isinstance(value, list): value = np.array(value) @@ -97,9 +103,9 @@ def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray elif isinstance(value, np.ndarray): # convert the video array to bytes return encode_np_array(value) - elif VideoReader is not None and isinstance(value, VideoReader): - # convert the torchvision video reader to bytes - return encode_torchvision_video(value) + elif VideoDecoder is not None and isinstance(value, VideoDecoder): + # convert the torchcodec video decoder to bytes + return encode_torchcodec_video(value) elif isinstance(value, dict): path, bytes_ = value.get("path"), value.get("bytes") if path is not None and os.path.isfile(path): @@ -119,7 +125,7 @@ def decode_example( self, value: Union[str, Example], token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, - ) -> "VideoReader": + ) -> "VideoDecoder": """Decode example video file into video data. Args: @@ -135,16 +141,16 @@ def decode_example( a dictionary repo_id (`str`) -> token (`bool` or `str`). Returns: - `torchvision.io.VideoReader` + `torchcodec.decoders.VideoDecoder` """ if not self.decode: raise RuntimeError("Decoding is disabled for this feature. Please use Video(decode=True) instead.") - if config.TORCHVISION_AVAILABLE: - from torchvision.io import VideoReader + if config.TORCHCODEC_AVAILABLE: + from torchcodec.decoders import VideoDecoder else: - raise ImportError("To support decoding videos, please install 'torchvision'.") + raise ImportError("To support decoding videos, please install 'torchcodec'.") if token_per_repo_id is None: token_per_repo_id = {} @@ -158,11 +164,32 @@ def decode_example( if path is None: raise ValueError(f"A video should have one of 'path' or 'bytes' but both are None in {value}.") elif is_local_path(path): - video = VideoReader(path) + video = VideoDecoder( + path, + stream_index = self.stream_index, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device = self.device, + seek_mode = self.seek_mode + ) else: - video = hf_video_reader(path, token_per_repo_id=token_per_repo_id) + video = hf_video_reader( + path, + token_per_repo_id=token_per_repo_id, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device = self.device, + seek_mode = self.seek_mode + ) else: - video = VideoReader(bytes_) + video = VideoDecoder( + bytes_, + stream_index = self.stream_index, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device = self.device, + seek_mode = self.seek_mode + ) video._hf_encoded = {"path": path, "bytes": bytes_} return video @@ -226,17 +253,17 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr return array_cast(storage, self.pa_type) -def video_to_bytes(video: "VideoReader") -> bytes: - """Convert a torchvision Video object to bytes using native compression if possible""" +def video_to_bytes(video: "VideoDecoder") -> bytes: + """Convert a torchcodec Video object to bytes using native compression if possible""" raise NotImplementedError() -def encode_torchvision_video(video: "VideoReader") -> Example: +def encode_torchcodec_video(video: "VideoDecoder") -> Example: if hasattr(video, "_hf_encoded"): return video._hf_encoded else: raise 
NotImplementedError(
-            "Encoding a VideoReader that doesn't come from datasets.Video.decode() is not implemented"
+            "Encoding a VideoDecoder that doesn't come from datasets.Video.decode() is not implemented"
         )
 
 
@@ -244,18 +271,20 @@ def encode_np_array(array: np.ndarray) -> Example:
     raise NotImplementedError()
 
 
-# Patching torchvision a little bit to:
+# No monkey patching needed anymore to:
 # 1. store the encoded video data {"path": ..., "bytes": ...} in `video._hf_encoded`
 # 2. add support for hf:// files
-# This doesn't affect the normal usage of torchvision.
-
 
 def hf_video_reader(
-    path: str, token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, stream: str = "video"
-) -> "VideoReader":
-    import av
-    from torchvision import get_video_backend
-    from torchvision.io import VideoReader
+    path: str,
+    token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None,
+    stream: str = "video",
+    dimension_order: Literal["NCHW", "NHWC"] = "NCHW",
+    num_ffmpeg_threads: int = 1,
+    device: Optional[Union[str, "torch.device"]] = "cpu",
+    seek_mode: Literal["exact", "approximate"] = "exact",
+) -> "VideoDecoder":
+    from torchcodec.decoders import VideoDecoder
 
     # Load the file from HF
     if token_per_repo_id is None:
@@ -267,14 +296,7 @@ def hf_video_reader(
         download_config = DownloadConfig(token=token)
     f = xopen(path, "rb", download_config=download_config)
 
-    # Instantiate the VideoReader
-    vr = object.__new__(VideoReader)
-    vr.backend = get_video_backend()
-    if vr.backend != "pyav":
-        raise RuntimeError(f"Unsupported video backend for VideoReader from HF files: {vr.backend}")
-    vr.container = av.open(f, metadata_errors="ignore")
-    stream_type = stream.split(":")[0]
+    # Instantiate the VideoDecoder
     stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
-    vr.pyav_stream = {stream_type: stream_id}
-    vr._c = vr.container.decode(**vr.pyav_stream)
-    return vr
+    vd = VideoDecoder(f, stream_index=stream_id, dimension_order=dimension_order, num_ffmpeg_threads=num_ffmpeg_threads, device=device, seek_mode=seek_mode)
+    return vd
diff --git a/tests/features/test_video.py b/tests/features/test_video.py
index a1008a67a48..c5794be9b66 100644
--- a/tests/features/test_video.py
+++ b/tests/features/test_video.py
@@ -2,10 +2,10 @@
 
 from datasets import Dataset, Features, Video
 
-from ..utils import require_torchvision
+from ..utils import require_torchcodec
 
 
-@require_torchvision
+@require_torchcodec
 @pytest.mark.parametrize(
     "build_example",
     [
@@ -19,7 +19,7 @@
     ],
 )
 def test_video_feature_encode_example(shared_datadir, build_example):
-    from torchvision.io import VideoReader
+    from torchcodec.decoders import VideoDecoder
 
     video_path = str(shared_datadir / "test_video_66x50.mov")
     video = Video()
@@ -28,13 +28,13 @@ def test_video_feature_encode_example(shared_datadir, build_example):
     assert encoded_example.keys() == {"bytes", "path"}
     assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
     decoded_example = video.decode_example(encoded_example)
-    assert isinstance(decoded_example, VideoReader)
+    assert isinstance(decoded_example, VideoDecoder)
 
 
-@require_torchvision
+@require_torchcodec
 def test_dataset_with_video_feature(shared_datadir):
     import torch
-    from torchvision.io import VideoReader
+    from torchcodec.decoders import VideoDecoder
 
     video_path = str(shared_datadir / "test_video_66x50.mov")
     data = {"video": [video_path]}
@@ -42,20 +42,20 @@ def test_dataset_with_video_feature(shared_datadir):
     dset = Dataset.from_dict(data, features=features)
     item = dset[0]
item.keys() == {"video"} - assert isinstance(item["video"], VideoReader) - assert next(item["video"])["data"].shape == (3, 50, 66) - assert isinstance(next(item["video"])["data"], torch.Tensor) + assert isinstance(item["video"], VideoDecoder) + assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(item["video"].get_frame_at(0).data, torch.Tensor) batch = dset[:1] assert len(batch) == 1 assert batch.keys() == {"video"} - assert isinstance(batch["video"], list) and all(isinstance(item, VideoReader) for item in batch["video"]) - assert next(batch["video"][0])["data"].shape == (3, 50, 66) - assert isinstance(next(batch["video"][0])["data"], torch.Tensor) + assert isinstance(batch["video"], list) and all(isinstance(item, VideoDecoder) for item in batch["video"]) + assert batch["video"][0].get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(batch["video"][0].get_frame_at(0).data, torch.Tensor) column = dset["video"] assert len(column) == 1 - assert isinstance(column, list) and all(isinstance(item, VideoReader) for item in column) - assert next(column[0])["data"].shape == (3, 50, 66) - assert isinstance(next(column[0])["data"], torch.Tensor) + assert isinstance(column, list) and all(isinstance(item, VideoDecoder) for item in column) + assert column[0].get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(column[0].get_frame_at(0).data, torch.Tensor) # from bytes with open(video_path, "rb") as f: @@ -63,14 +63,14 @@ def test_dataset_with_video_feature(shared_datadir): dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"video"} - assert isinstance(item["video"], VideoReader) - assert next(item["video"])["data"].shape == (3, 50, 66) - assert isinstance(next(item["video"])["data"], torch.Tensor) + assert isinstance(item["video"], VideoDecoder) + assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(item["video"].get_frame_at(0).data, torch.Tensor) -@require_torchvision +@require_torchcodec def test_dataset_with_video_map_and_formatted(shared_datadir): - from torchvision.io import VideoReader + from torchcodec.decoders import VideoDecoder video_path = str(shared_datadir / "test_video_66x50.mov") data = {"video": [video_path]} @@ -78,7 +78,7 @@ def test_dataset_with_video_map_and_formatted(shared_datadir): dset = Dataset.from_dict(data, features=features) dset = dset.map(lambda x: x).with_format("numpy") example = dset[0] - assert isinstance(example["video"], VideoReader) + assert isinstance(example["video"], VideoDecoder) # assert isinstance(example["video"][0], np.ndarray) # from bytes @@ -87,5 +87,5 @@ def test_dataset_with_video_map_and_formatted(shared_datadir): dset = Dataset.from_dict(data, features=features) dset = dset.map(lambda x: x).with_format("numpy") example = dset[0] - assert isinstance(example["video"], VideoReader) + assert isinstance(example["video"], VideoDecoder) # assert isinstance(example["video"][0], np.ndarray) diff --git a/tests/utils.py b/tests/utils.py index 827404fd13d..391cf6023e2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -189,6 +189,18 @@ def require_torchvision(test_case): test_case = unittest.skip("test requires torchvision")(test_case) return test_case +def require_torchcodec(test_case): + """ + Decorator marking a test that requires torchvision. + + These tests are skipped when torchvision isn't installed. 
+
+    """
+    if not config.TORCHCODEC_AVAILABLE:
+        test_case = unittest.skip("test requires torchcodec")(test_case)
+    return test_case
+
+
 def require_pdfplumber(test_case):
     """
 
From 7be0dcf5a1118d180883ca5997d2d3e3904fe129 Mon Sep 17 00:00:00 2001
From: Ty Todd
Date: Fri, 13 Jun 2025 01:02:37 -0400
Subject: [PATCH 02/25] Migrated Audio feature to use torchcodec as a backend.
 Fixed how the formatters handle torchcodec objects. Fixed test scripts to
 work with the new Audio backend

---
 src/datasets/features/audio.py             |  73 ++--
 src/datasets/formatting/jax_formatter.py   |   5 +
 src/datasets/formatting/np_formatter.py    |   5 +
 src/datasets/formatting/tf_formatter.py    |   5 +
 src/datasets/formatting/torch_formatter.py |   5 +
 tests/features/test_audio.py               | 420 ++++++++++++---------
 6 files changed, 304 insertions(+), 209 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 6992ef62f51..faac684cccd 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -9,12 +9,13 @@
 from .. import config
 from ..download.download_config import DownloadConfig
 from ..table import array_cast
-from ..utils.file_utils import xopen, xsplitext
+from ..utils.file_utils import is_local_path, xopen, xsplitext
 from ..utils.py_utils import no_op_if_value_is_null, string_to_dict
 
 
 if TYPE_CHECKING:
     from .features import FeatureType
+    from torchcodec.decoders import AudioDecoder
 
 
 @dataclass
@@ -66,6 +67,7 @@ class Audio:
     mono: bool = True
     decode: bool = True
     id: Optional[str] = None
+    stream_index: Optional[int] = None
     # Automatically constructed
     dtype: ClassVar[str] = "dict"
     pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
@@ -74,11 +76,11 @@ class Audio:
     def __call__(self):
         return self.pa_type
 
-    def encode_example(self, value: Union[str, bytes, bytearray, dict]) -> dict:
+    def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder"]) -> dict:
         """Encode example into a format for Arrow.
 
         Args:
-            value (`str` or `dict`):
+            value (`str`, `bytes`, `bytearray`, `dict` or `AudioDecoder`):
                 Data passed as input to Audio feature.
 
         Returns:
@@ -88,10 +90,22 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict]) -> dict:
             import soundfile as sf  # soundfile is a dependency of librosa, needed to decode audio files.
         except ImportError as err:
             raise ImportError("To support encoding audio data, please install 'soundfile'.") from err
+
+        try:
+            from torchcodec.decoders import AudioDecoder
+        except ImportError as err:
+            raise ImportError("To support encoding audio data, please install 'torchcodec'.") from err
+
         if isinstance(value, str):
             return {"bytes": None, "path": value}
         elif isinstance(value, (bytes, bytearray)):
             return {"bytes": value, "path": None}
+        elif isinstance(value, AudioDecoder):
+            samples = value.get_all_samples()
+            array = samples.data.cpu().numpy().T
+            buffer = BytesIO()
+            sf.write(buffer, array, samples.sample_rate, format="wav")
+            return {"bytes": buffer.getvalue(), "path": None}
         elif "array" in value:
             # convert the audio array to wav bytes
             buffer = BytesIO()
@@ -125,7 +139,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict]) -> dict:
 
     def decode_example(
         self, value: dict, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None
-    ) -> dict:
+    ) -> "AudioDecoder":
         """Decode example audio file into audio data.
 
         Args:
@@ -142,6 +156,11 @@ def decode_example(
         Returns:
-            `dict`
+            `torchcodec.decoders.AudioDecoder`
         """
+        try:
+            from torchcodec.decoders import AudioDecoder
+        except ImportError as err:
+            raise ImportError("To support decoding audio data, please install 'torchcodec'.") from err
+
         if not self.decode:
             raise RuntimeError("Decoding is disabled for this feature. Please use Audio(decode=True) instead.")
 
         path, file = (value["path"], BytesIO(value["bytes"])) if value["bytes"] is not None else (value["path"], None)
         if path is None and file is None:
             raise ValueError(f"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.")
 
-        try:
-            import librosa
-            import soundfile as sf
-        except ImportError as err:
-            raise ImportError("To support decoding audio files, please install 'librosa' and 'soundfile'.") from err
-
-        audio_format = xsplitext(path)[1][1:].lower() if path is not None else None
-        if not config.IS_OPUS_SUPPORTED and audio_format == "opus":
-            raise RuntimeError(
-                "Decoding 'opus' files requires system library 'libsndfile'>=1.0.31, "
-                'You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. '
-            )
-        elif not config.IS_MP3_SUPPORTED and audio_format == "mp3":
-            raise RuntimeError(
-                "Decoding 'mp3' files requires system library 'libsndfile'>=1.1.0, "
-                'You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. '
-            )
-
-        if file is None:
+        channels = 1 if self.mono else None
+        if file is None and is_local_path(path):
+            ad = AudioDecoder(path, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels)
+        elif file is None:
             token_per_repo_id = token_per_repo_id or {}
             source_url = path.split("::")[-1]
             pattern = (
@@ -178,19 +190,12 @@ def decode_example(
                 download_config = DownloadConfig(token=token)
 
             with xopen(path, "rb", download_config=download_config) as f:
-                array, sampling_rate = sf.read(f)
+                ad = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels)
 
         else:
-            array, sampling_rate = sf.read(file)
-
-        array = array.T
-        if self.mono:
-            array = librosa.to_mono(array)
-        if self.sampling_rate and self.sampling_rate != sampling_rate:
-            array = librosa.resample(array, orig_sr=sampling_rate, target_sr=self.sampling_rate)
-            sampling_rate = self.sampling_rate
-
-        return {"path": path, "array": array, "sampling_rate": sampling_rate}
+            ad = AudioDecoder(file, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels)
+
+        return ad
 
     def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
         """If in the decodable state, raise an error, otherwise flatten the feature into a dictionary."""
diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py
index ad414279ac8..f1a74110ef1 100644
--- a/src/datasets/formatting/jax_formatter.py
+++ b/src/datasets/formatting/jax_formatter.py
@@ -111,6 +111,11 @@ def _tensorize(self, value):
             if isinstance(value, VideoReader):
                 return value  # TODO(QL): set output to jax arrays ?
 
+        if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules:
+            from torchcodec.decoders import VideoDecoder, AudioDecoder
+
+            if isinstance(value, (VideoDecoder, AudioDecoder)):
+                return value  # TODO(QL): set output to jax arrays ?
 # using global variable since `jaxlib.xla_extension.Device` is not serializable neither
 # with `pickle` nor with `dill`, so we need to use a global variable instead
diff --git a/src/datasets/formatting/np_formatter.py b/src/datasets/formatting/np_formatter.py
index c12ecd4f386..83688ac0301 100644
--- a/src/datasets/formatting/np_formatter.py
+++ b/src/datasets/formatting/np_formatter.py
@@ -68,6 +68,11 @@ def _tensorize(self, value):
             if isinstance(value, VideoReader):
                 return value  # TODO(QL): set output to np arrays ?
 
+        if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules:
+            from torchcodec.decoders import VideoDecoder, AudioDecoder
+
+            if isinstance(value, (VideoDecoder, AudioDecoder)):
+                return value  # TODO(QL): set output to np arrays ?
 
         return np.asarray(value, **{**default_dtype, **self.np_array_kwargs})
 
diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py
index 964bc4b5589..532b33f4fd0 100644
--- a/src/datasets/formatting/tf_formatter.py
+++ b/src/datasets/formatting/tf_formatter.py
@@ -75,6 +75,11 @@ def _tensorize(self, value):
             if isinstance(value, VideoReader):
                 return value  # TODO(QL): set output to tf tensors ?
 
+        if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules:
+            from torchcodec.decoders import VideoDecoder, AudioDecoder
+
+            if isinstance(value, (VideoDecoder, AudioDecoder)):
+                return value  # TODO(QL): set output to tf tensors ?
 
         return tf.convert_to_tensor(value, **{**default_dtype, **self.tf_tensor_kwargs})
 
diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py
index cfc3fd9abe8..2831841da70 100644
--- a/src/datasets/formatting/torch_formatter.py
+++ b/src/datasets/formatting/torch_formatter.py
@@ -81,6 +81,11 @@ def _tensorize(self, value):
             if isinstance(value, VideoReader):
                 return value  # TODO(QL): set output to torch tensors ?
 
+        if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules:
+            from torchcodec.decoders import VideoDecoder, AudioDecoder
+
+            if isinstance(value, (VideoDecoder, AudioDecoder)):
+                return value  # TODO(QL): set output to torch tensors ?
 
         return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
 
diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index 695e0502712..04e9b6233af 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -6,10 +6,12 @@
 
 from datasets import Dataset, concatenate_datasets, load_dataset
 from datasets.features import Audio, Features, Sequence, Value
+from itertools import product
 
 from ..utils import (
     require_librosa,
     require_sndfile,
+    require_torchcodec,
 )
 
 
@@ -44,6 +46,8 @@ def test_audio_instantiation():
     assert audio.sampling_rate is None
     assert audio.mono is True
     assert audio.id is None
+    assert audio.stream_index is None
+
     assert audio.dtype == "dict"
     assert audio.pa_type == pa.struct({"bytes": pa.binary(), "path": pa.string()})
     assert audio._type == "Audio"
@@ -58,7 +62,9 @@ def test_audio_feature_type_to_arrow():
     assert features.arrow_schema == pa.schema({"sequence_of_audios": pa.list_(Audio().pa_type)})
 
 
-@require_librosa
+# @require_librosa
+@require_torchcodec
+@require_sndfile
 @pytest.mark.parametrize(
     "build_example",
     [
@@ -73,6 +79,7 @@ def test_audio_feature_type_to_arrow():
     ],
 )
 def test_audio_feature_encode_example(shared_datadir, build_example):
+    from torchcodec.decoders import AudioDecoder
     audio_path = str(shared_datadir / "test_audio_44100.wav")
     audio = Audio()
     encoded_example = audio.encode_example(build_example(audio_path))
@@ -80,10 +87,12 @@ def test_audio_feature_encode_example(shared_datadir, build_example):
     assert encoded_example.keys() == {"bytes", "path"}
     assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
     decoded_example = audio.decode_example(encoded_example)
-    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
+    assert isinstance(decoded_example, AudioDecoder)
 
 
-@require_librosa
+# @require_librosa
+@require_torchcodec
+@require_sndfile
 @pytest.mark.parametrize(
     "build_example",
     [
@@ -94,6 +103,7 @@ def test_audio_feature_encode_example(shared_datadir, build_example):
     ],
 )
 def test_audio_feature_encode_example_pcm(shared_datadir, build_example):
+    from torchcodec.decoders import AudioDecoder
     audio_path = str(shared_datadir / "test_audio_16000.pcm")
     audio = Audio(sampling_rate=16_000)
     encoded_example = audio.encode_example(build_example(audio_path))
@@ -101,124 +111,162 @@ def test_audio_feature_encode_example_pcm(shared_datadir, build_example):
     assert encoded_example.keys() == {"bytes", "path"}
     assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
     decoded_example = audio.decode_example(encoded_example)
-    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
+    assert isinstance(decoded_example, AudioDecoder)
+
+
+sample_rates = [16_000, 48_000]
+# @require_librosa
+@require_torchcodec
+@require_sndfile
+@pytest.mark.parametrize(
+    "in_sample_rate,out_sample_rate",
+    list(product(sample_rates, sample_rates)),
+)
+def test_audio_feature_encode_example_audiodecoder(shared_datadir, in_sample_rate, out_sample_rate):
+    from torchcodec.decoders import AudioDecoder
+    audio_path = str(shared_datadir / "test_audio_44100.wav")
+    audio = Audio(sampling_rate=out_sample_rate)
+    example = AudioDecoder(audio_path, sample_rate=in_sample_rate)
+    encoded_example = audio.encode_example(example)
+    assert isinstance(encoded_example, dict)
+    assert encoded_example.keys() == {"bytes", "path"}
+    assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
+    decoded_example = audio.decode_example(encoded_example)
+    assert
isinstance(decoded_example, AudioDecoder) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_audio_decode_example(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (202311,) - assert decoded_example["sampling_rate"] == 44100 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) with pytest.raises(RuntimeError): Audio(decode=False).decode_example(audio_path) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_audio_resampling(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio(sampling_rate=16000) decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (73401,) - assert decoded_example["sampling_rate"] == 16000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 73401) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_audio_decode_example_mp3(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.mp3") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (110592,) - assert decoded_example["sampling_rate"] == 44100 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 110592) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_audio_decode_example_opus(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_48000.opus") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (48000,) - assert decoded_example["sampling_rate"] == 48000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 48000 + assert samples.data.shape == (1, 48000) -@require_librosa +# @require_librosa +@require_torchcodec +@require_sndfile @pytest.mark.parametrize("sampling_rate", [16_000, 48_000]) def test_audio_decode_example_pcm(shared_datadir, sampling_rate): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_16000.pcm") audio_input = {"path": audio_path, "sampling_rate": 16_000} audio = Audio(sampling_rate=sampling_rate) decoded_example = audio.decode_example(audio.encode_example(audio_input)) - assert decoded_example.keys() == {"path", "array", 
"sampling_rate"} - assert decoded_example["path"] is None - assert decoded_example["array"].shape == (16208 * sampling_rate // 16_000,) - assert decoded_example["sampling_rate"] == sampling_rate + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == sampling_rate + assert samples.data.shape == (1, 16208 * sampling_rate // 16_000) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.mp3") audio_path2 = str(shared_datadir / "test_audio_16000.mp3") audio = Audio(sampling_rate=48000) decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (120373,) - assert decoded_example["sampling_rate"] == 48000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 48000 + assert samples.data.shape == (1, 120373) decoded_example = audio.decode_example(audio.encode_example(audio_path2)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path2 - assert decoded_example["array"].shape == (122688,) - assert decoded_example["sampling_rate"] == 48000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 48000 + assert samples.data.shape == (1, 122688) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature_tar_wav(tar_wav_path): + from torchcodec.decoders 
import AudioDecoder audio_filename = "test_audio_44100.wav" data = {"audio": []} for file_path, file_obj in iter_archive(tar_wav_path): @@ -228,28 +276,30 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_filename - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_filename - assert batch["audio"][0]["array"].shape == (202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_filename - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): + from torchcodec.decoders import AudioDecoder audio_filename = "test_audio_44100.mp3" data = {"audio": []} for file_path, file_obj in iter_archive(tar_mp3_path): @@ -259,25 +309,27 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_filename - assert item["audio"]["array"].shape == (110592,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 110592) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_filename - assert batch["audio"][0]["array"].shape == (110592,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 110592) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_filename - assert column[0]["array"].shape == (110592,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 110592) + +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature_with_none(): data = {"audio": [None]} @@ -312,125 +364,135 @@ def 
test_dataset_with_audio_feature_with_none(): assert item["nested"]["audio"] is None -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio(sampling_rate=16000)}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (73401,) - assert item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 73401) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (73401,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 73401) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (73401,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 73401) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.mp3") data = {"audio": [audio_path]} features = Features({"audio": Audio(sampling_rate=16000)}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (40125,) - assert item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 40124) # (1, 40125) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (40125,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 40124) # (1, 40125) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (40125,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape 
== (1, 40124) # (1, 40125) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] - assert item["audio"]["sampling_rate"] == 44100 + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 dset = dset.cast_column("audio", Audio(sampling_rate=16000)) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (73401,) - assert item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 73401) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (73401,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 73401) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (73401,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 73401) -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.mp3") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] - assert item["audio"]["sampling_rate"] == 44100 + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 dset = dset.cast_column("audio", Audio(sampling_rate=16000)) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (40125,) - assert item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 40124) # (1, 40125) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (40125,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 40124) # (1, 40125) column = dset["audio"] assert len(column) == 1 - assert 
column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (40125,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (1, 40124) # (1, 40125) -@require_librosa +@require_torchcodec @pytest.mark.parametrize( "build_data", [ @@ -444,18 +506,19 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) ], ) def test_dataset_cast_to_audio_features(shared_datadir, build_data): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") data = build_data(audio_path) dset = Dataset.from_dict(data) item = dset.cast(Features({"audio": Audio()}))[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} + assert isinstance(item["audio"], AudioDecoder) item = dset.cast_column("audio", Audio())[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} + assert isinstance(item["audio"], AudioDecoder) -@require_librosa +# @require_librosa def test_dataset_concatenate_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -465,11 +528,11 @@ def test_dataset_concatenate_audio_features(shared_datadir): dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()})) concatenated_dataset = concatenate_datasets([dset1, dset2]) assert len(concatenated_dataset) == len(dset1) + len(dset2) - assert concatenated_dataset[0]["audio"]["array"].shape == dset1[0]["audio"]["array"].shape - assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0]["audio"]["array"].shape + assert concatenated_dataset[0]["audio"].get_all_samples().data.shape == dset1[0]["audio"].get_all_samples().data.shape + assert concatenated_dataset[1]["audio"].get_all_samples().data.shape == dset2[0]["audio"].get_all_samples().data.shape -@require_librosa +# @require_librosa def test_dataset_concatenate_nested_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -481,16 +544,17 @@ def test_dataset_concatenate_nested_audio_features(shared_datadir): concatenated_dataset = concatenate_datasets([dset1, dset2]) assert len(concatenated_dataset) == len(dset1) + len(dset2) assert ( - concatenated_dataset[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape - == dset1[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape + concatenated_dataset[0]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape + == dset1[0]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape ) assert ( - concatenated_dataset[1]["list_of_structs_of_audios"][0]["audio"]["array"].shape - == dset2[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape + concatenated_dataset[1]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape + == dset2[0]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape ) @require_sndfile +@require_torchcodec def test_dataset_with_audio_feature_map_is_not_decoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path], "text": ["Hello"]} @@ -512,8 +576,9 @@ def 
process_text(example): assert item == {"audio": expected_audio, "text": "Hello World!"} -@require_librosa +# @require_librosa @require_sndfile +@require_torchcodec def test_dataset_with_audio_feature_map_is_decoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path], "text": ["Hello"]} @@ -521,7 +586,8 @@ def test_dataset_with_audio_feature_map_is_decoded(shared_datadir): dset = Dataset.from_dict(data, features=features) def process_audio_sampling_rate_by_example(example): - example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"] + sample_rate = example["audio"].get_all_samples().sample_rate + example["double_sampling_rate"] = 2 * sample_rate return example decoded_dset = dset.map(process_audio_sampling_rate_by_example) @@ -532,7 +598,7 @@ def process_audio_sampling_rate_by_example(example): def process_audio_sampling_rate_by_batch(batch): double_sampling_rates = [] for audio in batch["audio"]: - double_sampling_rates.append(2 * audio["sampling_rate"]) + double_sampling_rates.append(2 * audio.get_all_samples().sample_rate) batch["double_sampling_rate"] = double_sampling_rates return batch @@ -542,9 +608,11 @@ def process_audio_sampling_rate_by_batch(batch): assert item["double_sampling_rate"] == 88200 -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile def test_formatted_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path, audio_path]} features = Features({"audio": Audio()}) @@ -552,45 +620,45 @@ def test_formatted_dataset_with_audio_feature(shared_datadir): with dset.formatted_as("numpy"): item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) column = dset["audio"] assert len(column) == 2 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) with dset.formatted_as("pandas"): item = dset[0] assert item.shape == (1, 1) assert item.columns == ["audio"] - assert item["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert item["audio"][0]["path"] == audio_path - assert item["audio"][0]["array"].shape == (202311,) - assert item["audio"][0]["sampling_rate"] == 44100 + assert isinstance(item["audio"][0], AudioDecoder) + samples = item["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert 
samples.data.shape == (1, 202311) batch = dset[:1] assert batch.shape == (1, 1) assert batch.columns == ["audio"] - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) column = dset["audio"] assert len(column) == 2 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) @pytest.fixture @@ -606,20 +674,22 @@ def jsonl_audio_dataset_path(shared_datadir, tmp_path_factory): return path -@require_librosa +# @require_librosa +@require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, shared_datadir): + from torchcodec.decoders import AudioDecoder audio_path = str(shared_datadir / "test_audio_44100.wav") data_files = jsonl_audio_dataset_path features = Features({"audio": Audio(), "text": Value("string")}) dset = load_dataset("json", split="train", data_files=data_files, features=features, streaming=streaming) item = dset[0] if not streaming else next(iter(dset)) assert item.keys() == {"audio", "text"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) @require_sndfile From c0d3fce1f3989416470818d646c70875d680f6c2 Mon Sep 17 00:00:00 2001 From: Ty Todd Date: Fri, 13 Jun 2025 04:46:33 -0400 Subject: [PATCH 03/25] fixed audio and video features so they now pass the test_dataset_with_audio_feature_map_is_decoded test case. 
Implemented casting for VideoDecoder and AudioDecoder types

---
 src/datasets/features/features.py |  9 +++++++++
 tests/features/test_video.py      | 32 ++++++++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index c1b30133677..76a5af17ff1 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -301,6 +301,9 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas
 
     if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules:
         import pdfplumber
+
+    if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules:
+        from torchcodec.decoders import VideoDecoder, AudioDecoder
 
     if isinstance(obj, np.ndarray):
         if obj.ndim == 0:
@@ -438,6 +441,12 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas
             return list(obj), True
         else:
             return obj, False
+    elif config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules and isinstance(obj, VideoDecoder):
+        v = Video()
+        return v.encode_example(obj), True
+    elif config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules and isinstance(obj, AudioDecoder):
+        a = Audio()
+        return a.encode_example(obj), True
     else:
         return obj, False
 
diff --git a/tests/features/test_video.py b/tests/features/test_video.py
index c5794be9b66..b9e16f92959 100644
--- a/tests/features/test_video.py
+++ b/tests/features/test_video.py
@@ -1,6 +1,6 @@
 import pytest
 
-from datasets import Dataset, Features, Video
+from datasets import Dataset, Features, Video, Value
 
 from ..utils import require_torchcodec
 
@@ -89,3 +89,33 @@ def test_dataset_with_video_map_and_formatted(shared_datadir):
     example = dset[0]
     assert isinstance(example["video"], VideoDecoder)
     # assert isinstance(example["video"][0], np.ndarray)
+
+
+# Added Test Case
+def test_dataset_with_video_feature_map_is_decoded(shared_datadir):
+    video_path = str(shared_datadir / "test_video_66x50.mov")
+    data = {"video": [video_path], "text": ["Hello"]}
+    features = Features({"video": Video(), "text": Value("string")})
+    dset = Dataset.from_dict(data, features=features)
+
+    def process_video_begin_stream_seconds_by_example(example):
+        begin_stream_seconds = example["video"].metadata.begin_stream_seconds
+        example["double_begin_stream_seconds"] = 2 * begin_stream_seconds
+        return example
+
+    decoded_dset = dset.map(process_video_begin_stream_seconds_by_example)
+    for item in decoded_dset.cast_column("video", Video(decode=False)):
+        assert item.keys() == {"video", "text", "double_begin_stream_seconds"}
+        assert item["double_begin_stream_seconds"] == 0.0
+
+    def process_video_begin_stream_seconds_by_batch(batch):
+        double_begin_stream_seconds = []
+        for video in batch["video"]:
+            double_begin_stream_seconds.append(2 * video.metadata.begin_stream_seconds)
+        batch["double_begin_stream_seconds"] = double_begin_stream_seconds
+        return batch
+
+    decoded_dset = dset.map(process_video_begin_stream_seconds_by_batch, batched=True)
+    for item in decoded_dset.cast_column("video", Video(decode=False)):
+        assert item.keys() == {"video", "text", "double_begin_stream_seconds"}
+        assert item["double_begin_stream_seconds"] == 0.0
\ No newline at end of file

From 12511a342ad9590b697c9e7b462180531d10d2bc Mon Sep 17 00:00:00 2001
From: Ty Todd
Date: Fri, 13 Jun 2025 10:04:11 -0400
Subject: [PATCH 04/25] added load dataset test case to test_video.py

---
 tests/features/test_video.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/tests/features/test_video.py b/tests/features/test_video.py
b9e16f92959..455ac05ecc0 100644
--- a/tests/features/test_video.py
+++ b/tests/features/test_video.py
@@ -1,6 +1,6 @@
 import pytest

-from datasets import Dataset, Features, Video, Value
+from datasets import Dataset, Features, Video, Value, Audio, load_dataset

 from ..utils import require_torchcodec

@@ -91,7 +91,7 @@ def test_dataset_with_video_map_and_formatted(shared_datadir):
     # assert isinstance(example["video"][0], np.ndarray)


-# Added Test Case
+# Dataset casting and mapping
 def test_dataset_with_video_feature_map_is_decoded(shared_datadir):
     video_path = str(shared_datadir / "test_video_66x50.mov")
     data = {"video": [video_path], "text": ["Hello"]}
@@ -118,4 +118,29 @@ def process_audio_sampling_rate_by_batch(batch):
     decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True)
     for item in decoded_dset.cast_column("video", Video(decode=False)):
         assert item.keys() == {"video", "text", "double_begin_stream_seconds"}
-        assert item["double_begin_stream_seconds"] == 0.0
\ No newline at end of file
+        assert item["double_begin_stream_seconds"] == 0.0
+
+@pytest.fixture
+def jsonl_video_dataset_path(shared_datadir, tmp_path_factory):
+    import json
+
+    video_path = str(shared_datadir / "test_video_66x50.mov")
+    data = [{"video": video_path, "text": "Hello world!"}]
+    path = str(tmp_path_factory.mktemp("data") / "video_dataset.jsonl")
+    with open(path, "w") as f:
+        for item in data:
+            f.write(json.dumps(item) + "\n")
+    return path
+
+@require_torchcodec
+@pytest.mark.parametrize("streaming", [False, True])
+def test_load_dataset_with_video_feature(streaming, jsonl_video_dataset_path, shared_datadir):
+    from torchcodec.decoders import VideoDecoder
+    video_path = str(shared_datadir / "test_video_66x50.mov")
+    data_files = jsonl_video_dataset_path
+    features = Features({"video": Video(), "text": Value("string")})
+    dset = load_dataset("json", split="train", data_files=data_files, features=features, streaming=streaming)
+    item = dset[0] if not streaming else next(iter(dset))
+    assert item.keys() == {"video", "text"}
+    assert isinstance(item["video"], VideoDecoder)
+    assert item["video"].get_frame_at(0).data.shape == (3, 50, 66)
\ No newline at end of file
From 72f3ade23874450c28df5bd583b11072000a78a9 Mon Sep 17 00:00:00 2001
From: Ty Todd
Date: Fri, 13 Jun 2025 14:54:00 -0400
Subject: [PATCH 05/25] Modified documentation to document new torchcodec
 implementation of Video and Audio features. Fixed the rest of the test files
 to be compatible with new Audio and Video features.
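For reviewers, the access-pattern change this commit documents, as a minimal sketch
(assumes `torchcodec` is installed; `sample.wav` stands in for any local audio file
and is not part of this diff):

```python
from datasets import Audio, Dataset

# Casting a path column to Audio() now decodes to a torchcodec AudioDecoder
# instead of a {"path", "array", "sampling_rate"} dict.
ds = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column("audio", Audio())

decoder = ds[0]["audio"]             # torchcodec.decoders.AudioDecoder
samples = decoder.get_all_samples()  # decodes the full stream once
print(samples.sample_rate)           # e.g. 44100
print(samples.data.shape)            # (num_channels, num_samples) torch.Tensor
```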
--- docs/source/about_dataset_features.mdx | 19 ++-- docs/source/audio_dataset.mdx | 12 +-- docs/source/audio_load.mdx | 8 +- docs/source/audio_process.mdx | 47 +++++---- docs/source/create_dataset.mdx | 12 +-- docs/source/installation.md | 14 +-- docs/source/process.mdx | 38 +++---- docs/source/quickstart.mdx | 99 +++++++++++-------- docs/source/use_dataset.mdx | 36 +++---- docs/source/video_load.mdx | 42 ++++---- src/datasets/features/audio.py | 18 +--- src/datasets/features/video.py | 19 +++- .../audiofolder/audiofolder.py | 20 +++- tests/packaged_modules/test_audiofolder.py | 6 +- tests/packaged_modules/test_webdataset.py | 8 +- tests/test_formatting.py | 21 ++-- tests/test_upstream_hub.py | 2 +- 17 files changed, 224 insertions(+), 197 deletions(-) diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx index 30a221e6d6c..ddfeb1852b4 100644 --- a/docs/source/about_dataset_features.mdx +++ b/docs/source/about_dataset_features.mdx @@ -53,7 +53,7 @@ See the [flatten](./process#flatten) section to learn how you can extract the ne -The array feature type is useful for creating arrays of various sizes. You can create arrays with two dimensions using [`Array2D`], and even arrays with five dimensions using [`Array5D`]. +The array feature type is useful for creating arrays of various sizes. You can create arrays with two dimensions using [`Array2D`], and even arrays with five dimensions using [`Array5D`]. ```py >>> features = Features({'a': Array2D(shape=(1, 3), dtype='int32')}) @@ -69,9 +69,9 @@ The array type also allows the first dimension of the array to be dynamic. This Audio datasets have a column with type [`Audio`], which contains three important fields: -* `array`: the decoded audio data represented as a 1-dimensional array. -* `path`: the path to the downloaded audio file. -* `sampling_rate`: the sampling rate of the audio data. +- `array`: the decoded audio data represented as a 1-dimensional array. +- `path`: the path to the downloaded audio file. +- `sampling_rate`: the sampling rate of the audio data. When you load an audio dataset and call the audio column, the [`Audio`] feature automatically decodes and resamples the audio file: @@ -80,10 +80,7 @@ When you load an audio dataset and call the audio column, the [`Audio`] feature >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") >>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. 
], dtype=float32),
- 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
- 'sampling_rate': 8000}
+<torchcodec.decoders._audio_decoder.AudioDecoder object at 0x...>
 ```

@@ -92,7 +89,7 @@ Index into an audio dataset using the row index first and then the `audio` colum

-With `decode=False`, the [`Audio`] type simply gives you the path or the bytes of the audio file, without decoding it into an `array`,
+With `decode=False`, the [`Audio`] type simply gives you the path or the bytes of the audio file, without decoding it into a torchcodec `AudioDecoder` object,

 ```py
 >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train").cast_column("audio", Audio(decode=False))
@@ -126,7 +123,7 @@ Index into an image dataset using the row index first and then the `image` colum

-With `decode=False`, the [`Image`] type simply gives you the path or the bytes of the image file, without decoding it into an `PIL.Image`,
+With `decode=False`, the [`Image`] type simply gives you the path or the bytes of the image file, without decoding it into a `PIL.Image`,

 ```py
 >>> dataset = load_dataset("AI-Lab-Makerere/beans", split="train").cast_column("image", Image(decode=False))
@@ -146,4 +143,4 @@ You can also define a dataset of images from numpy arrays:

 And in this case the numpy arrays are encoded into PNG (or TIFF if the pixels values precision is important). For multi-channels arrays like RGB or RGBA, only uint8 is supported. If you use a larger precision, you get a warning and the array is downcasted to uint8.
-For gray-scale images you can use the integer or float precision you want as long as it is compatible with `Pillow`. A warning is shown if your image integer or float precision is too high, and in this case the array is downcated: an int64 array is downcasted to int32, and a float64 array is downcasted to float32.
+For gray-scale images you can use the integer or float precision you want as long as it is compatible with `Pillow`. A warning is shown if your image integer or float precision is too high, and in this case the array is downcasted: an int64 array is downcasted to int32, and a float64 array is downcasted to float32.
diff --git a/docs/source/audio_dataset.mdx b/docs/source/audio_dataset.mdx
index 6e6e0b75863..58c4b9f6345 100644
--- a/docs/source/audio_dataset.mdx
+++ b/docs/source/audio_dataset.mdx
@@ -10,10 +10,9 @@ dataset = load_dataset("/my_dataset")
 ```

 There are several methods for creating and sharing an audio dataset:

-* Create an audio dataset from local files in python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in python.
-
-* Create an audio dataset repository with the `AudioFolder` builder. This is a no-code solution for quickly creating an audio dataset with several thousand audio files.
+- Create an audio dataset from local files in python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in python.
+- Create an audio dataset repository with the `AudioFolder` builder. This is a no-code solution for quickly creating an audio dataset with several thousand audio files.

@@ -28,10 +27,7 @@ You can load your own dataset using the paths to your audio files. Use the [`~Da
 ```py
 >>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio())
 >>> audio_dataset[0]["audio"]
-{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414,
- 0. , 0.
], dtype=float32),
- 'path': 'path/to/audio_1',
- 'sampling_rate': 16000}
+<torchcodec.decoders._audio_decoder.AudioDecoder object at 0x...>
 ```

 Then upload the dataset to the Hugging Face Hub using [`Dataset.push_to_hub`]:

@@ -51,7 +47,6 @@ my_dataset/

 ## AudioFolder

-
 The `AudioFolder` is a dataset builder designed to quickly load an audio dataset with several thousand audio files without requiring you to write any code.

@@ -101,7 +96,6 @@ If all audio files are contained in a single directory or if they are not on the

-
 If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`.

 ```
diff --git a/docs/source/audio_load.mdx b/docs/source/audio_load.mdx
index c9db4fd5686..6c3352671db 100644
--- a/docs/source/audio_load.mdx
+++ b/docs/source/audio_load.mdx
@@ -8,7 +8,6 @@ Audio decoding is based on the [`soundfile`](https://github.com/bastibe/python-s

 To work with audio datasets, you need to have the `audio` dependencies installed. Check out the [installation](./installation#audio) guide to learn how to install it.

-
 ## Local files

 You can load your own dataset using the paths to your audio files. Use the [`~Dataset.cast_column`] function to take a column of audio file paths, and cast it to the [`Audio`] feature:

 ```py
 >>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio())
 >>> audio_dataset[0]["audio"]
-{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414,
- 0. , 0. ], dtype=float32),
- 'path': 'path/to/audio_1',
- 'sampling_rate': 16000}
+<torchcodec.decoders._audio_decoder.AudioDecoder object at 0x...>
 ```

 ## AudioFolder

@@ -99,7 +95,7 @@ For a guide on how to load any type of dataset, take a look at the general process guide.

-
 ## Cast

 The [`~Dataset.cast_column`] function is used to cast a column to another feature to be decoded. When you use this function with the [`Audio`] feature, you can resample the sampling rate:

@@ -22,16 +21,21 @@ The [`~Dataset.cast_column`] function is used to cast a column to another featur

 Audio files are decoded and resampled on-the-fly, so the next time you access an example, the audio file is resampled to 16kHz:

 ```py
->>> dataset[0]["audio"]
-{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ...,
- 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32),
- 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
- 'sampling_rate': 16000}
+>>> ad = dataset[0]["audio"]
+<torchcodec.decoders._audio_decoder.AudioDecoder object at 0x...>
+>>> ad.get_all_samples().sample_rate
+16000
 ```
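Beyond `get_all_samples()`, the decoder can also return just a slice of the stream, which avoids decoding whole files during preprocessing. A minimal sketch of the torchcodec API (assuming the resampled `dataset` from above; the exact sample count depends on the clip):

```py
>>> ad = dataset[0]["audio"]
>>> samples = ad.get_samples_played_in_range(start_seconds=0.0, stop_seconds=1.0)
>>> samples.sample_rate
16000
>>> samples.data.shape[1]  # roughly one second of samples at 16kHz
16000
```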
## Map

@@ -40,30 +45,31 @@ The [`~Dataset.map`] function helps preprocess your entire dataset at once. Depe

 - For pretrained speech recognition models, load a feature extractor and tokenizer and combine them in a `processor`:

-  ```py
-  >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor
+  ```py
+  >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor

-  >>> model_checkpoint = "facebook/wav2vec2-large-xlsr-53"
-  # after defining a vocab.json file you can instantiate a tokenizer object:
-  >>> tokenizer = AutoTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
-  >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
-  >>> processor = AutoProcessor.from_pretrained(feature_extractor=feature_extractor, tokenizer=tokenizer)
-  ```
+  >>> model_checkpoint = "facebook/wav2vec2-large-xlsr-53"
+  # after defining a vocab.json file you can instantiate a tokenizer object:
+  >>> tokenizer = AutoTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
+  >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
+  >>> processor = AutoProcessor.from_pretrained(feature_extractor=feature_extractor, tokenizer=tokenizer)
+  ```

 - For fine-tuned speech recognition models, you only need to load a `processor`:

-  ```py
-  >>> from transformers import AutoProcessor
+  ```py
+  >>> from transformers import AutoProcessor

-  >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
-  ```
+  >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
+  ```

 When you use [`~Dataset.map`] with your preprocessing function, include the `audio` column to ensure you're actually resampling the audio data:

 ```py
 >>> def prepare_dataset(batch):
 ...     audio = batch["audio"]
-...     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
+...     samples = audio.get_all_samples()
+...     batch["input_values"] = processor(samples.data, sampling_rate=samples.sample_rate).input_values[0]
 ...     batch["input_length"] = len(batch["input_values"])
 ...     with processor.as_target_processor():
 ...         batch["labels"] = processor(batch["sentence"]).input_ids
diff --git a/docs/source/create_dataset.mdx b/docs/source/create_dataset.mdx
index 7f12b2575c6..0a6d508f9ae 100644
--- a/docs/source/create_dataset.mdx
+++ b/docs/source/create_dataset.mdx
@@ -4,8 +4,8 @@ Sometimes, you may need to create a dataset if you're working with your own data

 In this tutorial, you'll learn how to use 🤗 Datasets low-code methods for creating all types of datasets:

-* Folder-based builders for quickly creating an image or audio dataset
-* `from_` methods for creating datasets from local files
+- Folder-based builders for quickly creating an image or audio dataset
+- `from_` methods for creating datasets from local files

 ## File-based builders

@@ -24,10 +24,10 @@ To get the list of supported formats and code examples, follow this guide [here]

 There are two folder-based builders, [`ImageFolder`] and [`AudioFolder`]. These are low-code methods for quickly creating an image or speech and audio dataset with several thousand examples. They are great for rapidly prototyping computer vision and speech models before scaling to a larger dataset. Folder-based builders takes your data and automatically generates the dataset's features, splits, and labels. Under the hood:

-* [`ImageFolder`] uses the [`~datasets.Image`] feature to decode an image file.
Many image extension formats are supported, such as jpg and png, but other formats are also supported. You can check the complete [list](https://github.com/huggingface/datasets/blob/b5672a956d5de864e6f5550e493527d962d6ae55/src/datasets/packaged_modules/imagefolder/imagefolder.py#L39) of supported image extensions. -* [`AudioFolder`] uses the [`~datasets.Audio`] feature to decode an audio file. Audio extensions such as wav and mp3 are supported, and you can check the complete [list](https://github.com/huggingface/datasets/blob/b5672a956d5de864e6f5550e493527d962d6ae55/src/datasets/packaged_modules/audiofolder/audiofolder.py#L39) of supported audio extensions. +- [`ImageFolder`] uses the [`~datasets.Image`] feature to decode an image file. Many image extension formats are supported, such as jpg and png, but other formats are also supported. You can check the complete [list](https://github.com/huggingface/datasets/blob/b5672a956d5de864e6f5550e493527d962d6ae55/src/datasets/packaged_modules/imagefolder/imagefolder.py#L39) of supported image extensions. +- [`AudioFolder`] uses the [`~datasets.Audio`] feature to decode an audio file. Extensions such as wav, mp3, and even mp4 are supported, and you can check the complete [list](https://ffmpeg.org/ffmpeg-formats.html) of supported audio extensions. Decoding is done via ffmpeg. -The dataset splits are generated from the repository structure, and the label names are automatically inferred from the directory name. +The dataset splits are generated from the repository structure, and the label names are automatically inferred from the directory name. For example, if your image dataset (it is the same for an audio dataset) is stored like this: @@ -44,7 +44,7 @@ pokemon/test/water/wartortle.png Then this is how the folder-based builder generates an example:
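Concretely, the generated example pairs the decoded image with a class label inferred from the directory name. A minimal sketch (assuming the `pokemon` layout above; the exact `PIL` object and label index depend on the files):

```py
>>> from datasets import load_dataset
>>> dataset = load_dataset("imagefolder", data_dir="pokemon")
>>> dataset["train"][0]
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=... at 0x...>, 'label': 0}
```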
Create the image dataset by specifying `imagefolder` in [`load_dataset`]: diff --git a/docs/source/installation.md b/docs/source/installation.md index a6027b2ee5d..c52b72cfc12 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -30,7 +30,7 @@ You should install 🤗 Datasets in a [virtual environment](https://docs.python. ```bash # Activate the virtual environment source .env/bin/activate - + # Deactivate the virtual environment source .env/bin/deactivate ``` @@ -65,18 +65,6 @@ To work with audio datasets, you need to install the [`Audio`] feature as an ext pip install datasets[audio] ``` - - -To decode mp3 files, you need to have at least version 1.1.0 of the `libsndfile` system library. Usually, it's bundled with the python [`soundfile`](https://github.com/bastibe/python-soundfile) package, which is installed as an extra audio dependency for 🤗 Datasets. -For Linux, the required version of `libsndfile` is bundled with `soundfile` starting from version 0.12.0. You can run the following command to determine which version of `libsndfile` is being used by `soundfile`: - -```bash -python -c "import soundfile; print(soundfile.__libsndfile_version__)" -``` - - - - ## Vision To work with image datasets, you need to install the [`Image`] feature as an extra dependency: diff --git a/docs/source/process.mdx b/docs/source/process.mdx index e41767a295f..cb3955325de 100644 --- a/docs/source/process.mdx +++ b/docs/source/process.mdx @@ -289,7 +289,7 @@ Notice how the subfields are now their own independent columns: `answers.text` a Some of the more powerful applications of 🤗 Datasets come from using the [`~Dataset.map`] function. The primary purpose of [`~Dataset.map`] is to speed up processing functions. It allows you to apply a processing function to each example in a dataset, independently or in batches. This function can even create new rows and columns. -In the following example, prefix each `sentence1` value in the dataset with `'My sentence: '`. +In the following example, prefix each `sentence1` value in the dataset with `'My sentence: '`. Start by creating a function that adds `'My sentence: '` to the beginning of each sentence. The function needs to accept and output a `dict`: @@ -348,18 +348,18 @@ Multiprocessing significantly speeds up processing by parallelizing processes on >>> updated_dataset = dataset.map(lambda example, idx: {"sentence2": f"{idx}: " + example["sentence2"]}, with_indices=True, num_proc=4) ``` -The [`~Dataset.map`] also works with the rank of the process if you set `with_rank=True`. This is analogous to the `with_indices` parameter. The `with_rank` parameter in the mapped function goes after the `index` one if it is already present. +The [`~Dataset.map`] also works with the rank of the process if you set `with_rank=True`. This is analogous to the `with_indices` parameter. The `with_rank` parameter in the mapped function goes after the `index` one if it is already present. 
```py >>> import torch >>> from multiprocess import set_start_method ->>> from transformers import AutoTokenizer, AutoModelForCausalLM +>>> from transformers import AutoTokenizer, AutoModelForCausalLM >>> from datasets import load_dataset ->>> +>>> >>> # Get an example dataset >>> dataset = load_dataset("fka/awesome-chatgpt-prompts", split="train") ->>> ->>> # Get an example model and its tokenizer +>>> +>>> # Get an example model and its tokenizer >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B-Chat").eval() >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat") >>> @@ -367,7 +367,7 @@ The [`~Dataset.map`] also works with the rank of the process if you set `with_ra ... # Move the model on the right GPU if it's not there already ... device = f"cuda:{(rank or 0) % torch.cuda.device_count()}" ... model.to(device) -... +... ... # Your big GPU call goes here, for example: ... chats = [[ ... {"role": "system", "content": "You are a helpful assistant."}, @@ -395,7 +395,7 @@ The [`~Dataset.map`] also works with the rank of the process if you set `with_ra ... ) ``` -The main use-case for rank is to parallelize computation across several GPUs. This requires setting `multiprocess.set_start_method("spawn")`. If you don't you'll receive the following CUDA error: +The main use-case for rank is to parallelize computation across several GPUs. This requires setting `multiprocess.set_start_method("spawn")`. If you don't you'll receive the following CUDA error: ```bash RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method. @@ -528,7 +528,7 @@ Note the presence of a `Semaphore`: it sets the maximum number of queries that c Let's use it to call the [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model and ask it to return the main topic of each math problem in the [Maxwell-Jia/AIME_2024](https://huggingface.co/Maxwell-Jia/AIME_2024) dataset: -```python +````python >>> from datasets import load_dataset >>> ds = load_dataset("Maxwell-Jia/AIME_2024", split="train") >>> model = "microsoft/Phi-3-mini-4k-instruct" @@ -542,7 +542,7 @@ Let's use it to call the [microsoft/Phi-3-mini-4k-instruct](https://huggingface. 'Solution': 'Denote $\\log_2(x) = a$, $\\log_2(y) = b$, and..., 'Answer': 33, 'Output': 'The main topic is Logarithms.'} -``` +```` Here, [`Dataset.map`] runs many `get_topic` function asynchronously so it doesn't have to wait for every single model response which would take a lot of time to do sequentially. @@ -644,7 +644,7 @@ You can also concatenate two datasets horizontally by setting `axis=1` as long a ### Interleave -You can also mix several datasets together by taking alternating examples from each one to create a new dataset. This is known as *interleaving*, which is enabled by the [`interleave_datasets`] function. Both [`interleave_datasets`] and [`concatenate_datasets`] work with regular [`Dataset`] and [`IterableDataset`] objects. +You can also mix several datasets together by taking alternating examples from each one to create a new dataset. This is known as _interleaving_, which is enabled by the [`interleave_datasets`] function. Both [`interleave_datasets`] and [`concatenate_datasets`] work with regular [`Dataset`] and [`IterableDataset`] objects. Refer to the [Stream](./stream#interleave) guide for an example of how to interleave [`IterableDataset`] objects. 
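For map-style datasets, a minimal sketch of the default behavior (toy columns; by default one example is taken from each dataset in turn until the smallest is exhausted):

```py
>>> from datasets import Dataset, interleave_datasets
>>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
>>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
>>> interleave_datasets([d1, d2])["a"]
[0, 10, 1, 11, 2, 12]
```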
You can define sampling probabilities for each of the original datasets to specify how to interleave the datasets. @@ -779,9 +779,9 @@ The [`~Dataset.with_transform`] function applies a custom formatting transform o There is also [`~Dataset.set_transform`] which does the same but runs in-place. -You can also use the [`~Dataset.with_transform`] function to decode formats not supported by [`Features`]. For example, the [`Audio`] feature uses [`soundfile`](https://python-soundfile.readthedocs.io/en/0.11.0/) - a fast and simple library to install - but it does not provide support for less common audio formats. Here is where you can use [`~Dataset.set_transform`] to apply a custom decoding transform on the fly. You're free to use any library you like to decode the audio files. +You can also use the [`~Dataset.with_transform`] function for custom decoding on [`Features`]. -The example below uses the [`pydub`](http://pydub.com/) package to open an audio format not supported by `soundfile`: +The example below uses the [`pydub`](http://pydub.com/) package as an alternative to `torchcodec` decoding: ```py >>> import numpy as np @@ -838,12 +838,12 @@ Use the [`~Dataset.save_to_disk`] and [`load_from_disk`] function to reload the 🤗 Datasets supports exporting as well so you can work with your dataset in other applications. The following table shows currently supported file formats you can export to: -| File type | Export method | -|-------------------------|----------------------------------------------------------------| -| CSV | [`Dataset.to_csv`] | -| JSON | [`Dataset.to_json`] | -| Parquet | [`Dataset.to_parquet`] | -| SQL | [`Dataset.to_sql`] | +| File type | Export method | +| ----------------------- | ------------------------------------------------------------------- | +| CSV | [`Dataset.to_csv`] | +| JSON | [`Dataset.to_json`] | +| Parquet | [`Dataset.to_parquet`] | +| SQL | [`Dataset.to_sql`] | | In-memory Python object | [`Dataset.to_pandas`], [`Dataset.to_polars`] or [`Dataset.to_dict`] | For example, export your dataset to a CSV file like this: diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index cf71dee75a0..3dd00027f0e 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -19,20 +19,44 @@ This quickstart is intended for developers who are ready to dive into the code a Each dataset is unique, and depending on the task, some datasets may require additional steps to prepare it for training. But you can always use 🤗 Datasets tools to load and process a dataset. The fastest and easiest way to get started is by loading an existing dataset from the [Hugging Face Hub](https://huggingface.co/datasets). There are thousands of datasets to choose from, spanning many tasks. Choose the type of dataset you want to work with, and let's get started! 
@@ -49,31 +73,23 @@ pip install datasets 🤗 Datasets also support audio and image data formats: -* To work with audio datasets, install the [`Audio`] feature: +- To work with audio datasets, install the [`Audio`] feature: - ```bash - pip install datasets[audio] - ``` + ```bash + pip install datasets[audio] + ``` -* To work with image datasets, install the [`Image`] feature: +- To work with image datasets, install the [`Image`] feature: - ```bash - pip install datasets[vision] - ``` + ```bash + pip install datasets[vision] + ``` Besides 🤗 Datasets, make sure your preferred machine learning framework is installed: - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - + ```bash pip install torch ``` + ```bash pip install tensorflow ``` ## Audio @@ -102,10 +118,7 @@ Audio datasets are loaded just like text datasets. However, an audio dataset is ```py >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) >>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} + ``` **4**. Create a function to preprocess the audio `array` with the feature extractor, and truncate and pad the sequences into tidy rectangular tensors. The most important thing to remember is to call the audio `array` in the feature extractor since the `array` - the actual speech signal - is the model input. @@ -114,7 +127,7 @@ Once you have a preprocessing function, use the [`~Dataset.map`] function to spe ```py >>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] +... audio_arrays = [x.get_all_samples().data for x in examples["audio"]] ... inputs = feature_extractor( ... audio_arrays, ... sampling_rate=16000, @@ -145,12 +158,13 @@ Use the [`~Dataset.set_format`] function to set the dataset format to `torch` an >>> dataset.set_format(type="torch", columns=["input_values", "labels"]) >>> dataloader = DataLoader(dataset, batch_size=4) ``` + Use the [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] method from 🤗 Transformers to prepare the dataset to be compatible with TensorFlow, and ready to train/fine-tune a model, as it wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset` -with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. +with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. ```py >>> import tensorflow as tf @@ -161,6 +175,7 @@ with collation and batching, so one can pass it directly to Keras methods like ` ... shuffle=True, ... ) ``` + @@ -225,12 +240,13 @@ Wrap the dataset in [`torch.utils.data.DataLoader`](https://alband.github.io/doc ... for example in examples: ... images.append((example["pixel_values"])) ... labels.append(example["labels"]) -... +... ... pixel_values = torch.stack(images) ... labels = torch.tensor(labels) ... return {"pixel_values": pixel_values, "labels": labels} >>> dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=4) ``` + @@ -267,6 +283,7 @@ pip install -U albumentations opencv-python ... shuffle=True, ... 
) ``` + @@ -335,12 +352,13 @@ Use the [`~Dataset.set_format`] function to set the dataset format to `torch` an >>> dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"]) >>> dataloader = torch.utils.data.DataLoader(dataset, batch_size=32) ``` + Use the [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] method from 🤗 Transformers to prepare the dataset to be compatible with TensorFlow, and ready to train/fine-tune a model, as it wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset` -with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. +with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. ```py >>> import tensorflow as tf @@ -351,6 +369,7 @@ with collation and batching, so one can pass it directly to Keras methods like ` ... shuffle=True, ... ) ``` + diff --git a/docs/source/use_dataset.mdx b/docs/source/use_dataset.mdx index 073b5261e0d..6b3d83ff48e 100644 --- a/docs/source/use_dataset.mdx +++ b/docs/source/use_dataset.mdx @@ -1,6 +1,6 @@ # Preprocess -In addition to loading datasets, 🤗 Datasets other main goal is to offer a diverse set of preprocessing functions to get a dataset into an appropriate format for training with your machine learning framework. +In addition to loading datasets, 🤗 Datasets other main goal is to offer a diverse set of preprocessing functions to get a dataset into an appropriate format for training with your machine learning framework. There are many possible ways to preprocess a dataset, and it all depends on your specific dataset. Sometimes you may need to rename a column, and other times you might need to unflatten nested fields. 🤗 Datasets provides a way to do most of these things. But in nearly all preprocessing cases, depending on your dataset modality, you'll need to: @@ -20,7 +20,7 @@ Grab a dataset of your choice and follow along! ## Tokenize text -Models cannot process raw text, so you'll need to convert the text into numbers. Tokenization provides a way to do this by dividing text into individual words called *tokens*. Tokens are finally converted to numbers. +Models cannot process raw text, so you'll need to convert the text into numbers. Tokenization provides a way to do this by dividing text into individual words called _tokens_. Tokens are finally converted to numbers. 
@@ -42,8 +42,8 @@ Check out the [Tokenizers](https://huggingface.co/course/chapter2/4?fw=pt) secti ```py >>> tokenizer(dataset[0]["text"]) -{'input_ids': [101, 1103, 2067, 1110, 17348, 1106, 1129, 1103, 6880, 1432, 112, 188, 1207, 107, 14255, 1389, 107, 1105, 1115, 1119, 112, 188, 1280, 1106, 1294, 170, 24194, 1256, 3407, 1190, 170, 11791, 5253, 188, 1732, 7200, 10947, 12606, 2895, 117, 179, 7766, 118, 172, 15554, 1181, 3498, 6961, 3263, 1137, 188, 1566, 7912, 14516, 6997, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], +{'input_ids': [101, 1103, 2067, 1110, 17348, 1106, 1129, 1103, 6880, 1432, 112, 188, 1207, 107, 14255, 1389, 107, 1105, 1115, 1119, 112, 188, 1280, 1106, 1294, 170, 24194, 1256, 3407, 1190, 170, 11791, 5253, 188, 1732, 7200, 10947, 12606, 2895, 117, 179, 7766, 118, 172, 15554, 1181, 3498, 6961, 3263, 1137, 188, 1566, 7912, 14516, 6997, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} ``` @@ -75,6 +75,7 @@ Use the [`~Dataset.set_format`] function to set the dataset format to be compati >>> dataset.format['type'] 'torch' ``` + Use the [`~Dataset.to_tf_dataset`] function to set the dataset format to be compatible with TensorFlow. You'll also need to import a [data collator](https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorWithPadding) from 🤗 Transformers to combine the varying sequence lengths into a single batch of equal lengths: @@ -91,6 +92,7 @@ Use the [`~Dataset.to_tf_dataset`] function to set the dataset format to be comp ... shuffle=True ... ) ``` + @@ -98,7 +100,7 @@ Use the [`~Dataset.to_tf_dataset`] function to set the dataset format to be comp ## Resample audio signals -Audio inputs like text datasets need to be divided into discrete data points. This is known as *sampling*; the sampling rate tells you how much of the speech signal is captured per second. It is important to make sure the sampling rate of your dataset matches the sampling rate of the data used to pretrain the model you're using. If the sampling rates are different, the pretrained model may perform poorly on your dataset because it doesn't recognize the differences in the sampling rate. +Audio inputs like text datasets need to be divided into discrete data points. This is known as _sampling_; the sampling rate tells you how much of the speech signal is captured per second. It is important to make sure the sampling rate of your dataset matches the sampling rate of the data used to pretrain the model you're using. If the sampling rates are different, the pretrained model may perform poorly on your dataset because it doesn't recognize the differences in the sampling rate. **1**. Start by loading the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset, the [`Audio`] feature, and the feature extractor corresponding to a pretrained [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) model: @@ -113,11 +115,11 @@ Audio inputs like text datasets need to be divided into discrete data points. Th **2**. Index into the first row of the dataset. 
When you call the `audio` column of the dataset, it is automatically decoded and resampled: ```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} +>>> ad = dataset[0]["audio"] +>>> print(ad) + +>>> ad.get_all_samples().sample_rate +8000 ``` **3**. Reading a dataset card is incredibly useful and can give you a lot of information about the dataset. A quick look at the MInDS-14 dataset card tells you the sampling rate is 8kHz. Likewise, you can get many details about a model from its model card. The Wav2Vec2 model card says it was sampled on 16kHz speech audio. This means you'll need to upsample the MInDS-14 dataset to match the sampling rate of the model. @@ -126,18 +128,18 @@ Use the [`~Dataset.cast_column`] function and set the `sampling_rate` parameter ```py >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} +>>> ad = dataset[0]["audio"] +>>> print(ad) + +>>> ad.get_all_samples().sample_rate +16000 ``` **4**. Use the [`~Dataset.map`] function to resample the entire dataset to 16kHz. This function speeds up resampling by applying the feature extractor to batches of examples instead of individual examples. Set the `batched` parameter to `True`: ```py >>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] +... audio_arrays = [x.get_all_samples().data for x in examples["audio"]] ... inputs = feature_extractor( ... audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True ... ) @@ -150,7 +152,7 @@ Use the [`~Dataset.cast_column`] function and set the `sampling_rate` parameter ## Apply data augmentations -The most common preprocessing you'll do with image datasets is *data augmentation*, a process that introduces random variations to an image without changing the meaning of the data. This can mean changing the color properties of an image or randomly cropping an image. You are free to use any data augmentation library you like, and 🤗 Datasets will help you apply your data augmentations to your dataset. +The most common preprocessing you'll do with image datasets is _data augmentation_, a process that introduces random variations to an image without changing the meaning of the data. This can mean changing the color properties of an image or randomly cropping an image. You are free to use any data augmentation library you like, and 🤗 Datasets will help you apply your data augmentations to your dataset. **1**. 
Start by loading the [Beans](https://huggingface.co/datasets/beans) dataset, the `Image` feature, and the feature extractor corresponding to a pretrained [ViT](https://huggingface.co/google/vit-base-patch16-224-in21k) model:
diff --git a/docs/source/video_load.mdx b/docs/source/video_load.mdx
index e636a2e8c7b..5aad7aa0709 100644
--- a/docs/source/video_load.mdx
+++ b/docs/source/video_load.mdx
@@ -6,7 +6,7 @@

 Video support is experimental and is subject to change.

-Video datasets have [`Video`] type columns, which contain `torchvision` objects.
+Video datasets have [`Video`] type columns, which contain `torchcodec` objects.

@@ -21,7 +21,7 @@ When you load a video dataset and call the video column, the videos are decoded
 >>> dataset = load_dataset("path/to/video/folder", split="train")
 >>> dataset[0]["video"]
-<torchvision.io.video_reader.VideoReader object at 0x...>
+<torchcodec.decoders._video_decoder.VideoDecoder object at 0x...>
 ```

@@ -38,43 +38,41 @@ Access frames directly from a video using the `VideoReader` using `next()`:

 ```python
 >>> video = dataset[0]["video"]
->>> first_frame = next(video)
->>> first_frame["data"].shape
+>>> first_frame = video.get_frame_at(0)
+>>> first_frame.data.shape
 (3, 240, 320)
->>> first_frame["pts"] # timestamp
+>>> first_frame.pts_seconds # timestamp
 0.0
 ```

-To get multiple frames at once, you need to iterate on the `VideoReader`. This is the efficient way to obtain a long list of frames:
+To get multiple frames at once, you can call `.get_frames_in_range(start: int, stop: int, step: int)`, which returns a batch of frames.
+This is the efficient way to obtain a long list of frames; refer to the [torchcodec docs](https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.VideoDecoder.html) for more functions for efficiently accessing the data:

 ```python
 >>> import torch
->>> import itertools
->>> frames = torch.stack([frame["data"] for frame in islice(video, 5)])
->>> frames.shape
-(5, 3, 240, 320)
+>>> frames = video.get_frames_in_range(0, 6, 1)
+>>> frames.data.shape
+torch.Size([5, 3, 240, 320])
 ```

-There is also `.seek()` if you need to set the current timestamp of the video:
+There is also `.get_frames_played_in_range(start_seconds: float, stop_seconds: float)` to access all frames played within a certain time range.

 ```python
->>> video.get_metadata()
-{'video': {'fps': [10.0], 'duration': [16.1]}}
->>> video = video.seek(8.0, keyframes_only=True)
->>> frame = next(video)
->>> first_frame["data"].shape
-(3, 240, 320)
+>>> frames = video.get_frames_played_in_range(0.5, 1.2)
+>>> frames.data.shape
+torch.Size([42, 3, 240, 320])
 ```

 ## Local files

 You can load a dataset from the video path.
Use the [`~Dataset.cast_column`] function to accept a column of video file paths, and decode it into a `torchcodec` video with the [`Video`] feature: + ```py >>> from datasets import Dataset, Video >>> dataset = Dataset.from_dict({"video": ["path/to/video_1", "path/to/video_2", ..., "path/to/video_n"]}).cast_column("video", Video()) >>> dataset[0]["video"] - + ``` If you only want to load the underlying path to the video dataset without decoding the video object, set `decode=False` in the [`Video`] feature: @@ -116,14 +114,14 @@ For local datasets, this is equivalent to passing `videofolder` manually in [`lo >>> dataset = load_dataset("videofolder", data_dir="/path/to/folder") ``` -Then you can access the videos as `torchvision.io.video_reader.VideoReader` objects: +Then you can access the videos as `torchcodec.decoders._video_decoder.VideoDecoder` objects: ``` >>> dataset["train"][0] -{"video": , "label": 0} +{"video": , "label": 0} >>> dataset["train"][-1] -{"video": , "label": 1} +{"video": , "label": 1} ``` To ignore the information in the metadata file, set `drop_metadata=True` in [`load_dataset`]: diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index faac684cccd..7f2d84a669e 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -48,6 +48,8 @@ class Audio: decode (`bool`, defaults to `True`): Whether to decode the audio data. If `False`, returns the underlying dictionary in the format `{"path": audio_path, "bytes": audio_bytes}`. + stream_index (`int`, *optional*): + The streaming index to use from the file. If `None` defaults to the "best" index. Example: @@ -56,10 +58,7 @@ class Audio: >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train") >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000)) >>> ds[0]["audio"] - {'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} + ``` """ @@ -154,7 +153,7 @@ def decode_example( a dictionary repo_id (`str`) -> token (`bool` or `str`) Returns: - `dict` + `AudioDecoder` """ try: from torchcodec.decoders import AudioDecoder @@ -170,14 +169,7 @@ def decode_example( channels = 1 if self.mono else None if file is None and is_local_path(path): - # print("is_local_path") - # print("stream_index", self.stream_index) - # print("sample_rate", self.sampling_rate) - # print("num_channels", channels) ad = AudioDecoder(path, stream_index = self.stream_index, sample_rate = self.sampling_rate, num_channels = channels) - # print("ad", ad) - # print("ad.metadata", ad.metadata) - # print("ad.metadata.sample_rate", ad.metadata.sample_rate) elif file is None: token_per_repo_id = token_per_repo_id or {} @@ -194,7 +186,7 @@ def decode_example( else: ad = AudioDecoder(file, stream_index = self.stream_index, sample_rate = self.sampling_rate, num_channels = channels) - + ad.metadata.path = path return ad def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]: diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index d34535a55dd..38800b50042 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -46,6 +46,21 @@ class Video: decode (`bool`, defaults to `True`): Whether to decode the video data. 
If `False`, returns the underlying dictionary in the format `{"path": video_path, "bytes": video_bytes}`. + stream_index (`int`, *optional*): + The streaming index to use from the file. If `None` defaults to the "best" index. + dimension_order (`str`, defaults to `NCHW`): + The dimension order of the decoded frames. + where N is the batch size, C is the number of channels, + H is the height, and W is the width of the frames. + num_ffmpeg_threads (`int`, defaults to `1`): + The number of threads to use for decoding the video. (Recommended to keep this at 1) + device (`str` or `torch.device`, defaults to `cpu`): + The device to use for decoding the video. + seek_mode (`str`, defaults to `exact`): + Determines if frame access will be “exact” or “approximate”. + Exact guarantees that requesting frame i will always return frame i, but doing so requires an initial scan of the file. + Approximate is faster as it avoids scanning the file, but less accurate as it uses the file's metadata to calculate where i probably is. + read more [here](https://docs.pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html#sphx-glr-generated-examples-approximate-mode-py) Examples: @@ -55,7 +70,7 @@ class Video: >>> ds.features["video"] Video(decode=True, id=None) >>> ds[0]["video"] - + >>> ds = ds.cast_column('video', Video(decode=False)) {'bytes': None, 'path': 'path/to/Screen Recording.mov'} @@ -81,7 +96,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray """Encode example into a format for Arrow. Args: - value (`str`, `np.ndarray`, `VideoDecoder` or `dict`): + value (`str`, `np.ndarray`, `bytes`, `bytearray`, `VideoDecoder` or `dict`): Data passed as input to Video feature. Returns: diff --git a/src/datasets/packaged_modules/audiofolder/audiofolder.py b/src/datasets/packaged_modules/audiofolder/audiofolder.py index 12ed9efdf04..5df06f7a09e 100644 --- a/src/datasets/packaged_modules/audiofolder/audiofolder.py +++ b/src/datasets/packaged_modules/audiofolder/audiofolder.py @@ -63,5 +63,23 @@ class AudioFolder(folder_based_builder.FolderBasedBuilder): ".xi", ".mp3", ".opus", + ".3gp", + ".3g2", + ".avi", + ".asf", + ".flv", + ".mp4", + ".mov", + ".m4v", + ".mkv", + ".mpg", + ".webm", + ".f4v", + ".wmv", + ".wma", + ".ogg", + ".ogm", + ".mxf", + ".nut", ] -AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS +AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS \ No newline at end of file diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py index a7aabb8fdf3..410115da647 100644 --- a/tests/packaged_modules/test_audiofolder.py +++ b/tests/packaged_modules/test_audiofolder.py @@ -279,7 +279,7 @@ def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_fi dataset = list(datasets[split]) assert len(dataset) == expected_num_of_audios # make sure each sample has its own audio and metadata - assert len({example["audio"]["path"] for example in dataset}) == expected_num_of_audios + assert len({example["audio"].metadata.path for example in dataset}) == expected_num_of_audios assert len({example["text"] for example in dataset}) == expected_num_of_audios assert all(example["text"] is not None for example in dataset) @@ -298,7 +298,7 @@ def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data dataset = list(datasets[split]) assert len(dataset) == expected_num_of_audios # make sure each sample has its own audio and metadata - assert len({example["audio"]["path"] for example in dataset}) == expected_num_of_audios 
+ assert len({example["audio"].metadata.path for example in dataset}) == expected_num_of_audios assert len({example["text"] for example in dataset}) == expected_num_of_audios assert all(example["text"] is not None for example in dataset) @@ -318,7 +318,7 @@ def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_ assert len(dataset) == expected_num_of_audios # make sure each sample has its own audio (all arrays are different) and metadata assert ( - sum(np.array_equal(dataset[0]["audio"]["array"], example["audio"]["array"]) for example in dataset[1:]) + sum(np.array_equal(dataset[0]["audio"].get_all_samples().data.numpy(), example["audio"].get_all_samples().data.numpy()) for example in dataset[1:]) == 0 ) assert len({example["text"] for example in dataset}) == expected_num_of_audios diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py index 128f13022fc..b7b3aa1348f 100644 --- a/tests/packaged_modules/test_webdataset.py +++ b/tests/packaged_modules/test_webdataset.py @@ -7,7 +7,7 @@ from datasets import Audio, DownloadManager, Features, Image, Sequence, Value from datasets.packaged_modules.webdataset.webdataset import WebDataset -from ..utils import require_librosa, require_numpy1_on_windows, require_pil, require_sndfile, require_torch +from ..utils import require_librosa, require_numpy1_on_windows, require_pil, require_sndfile, require_torch, require_torchcodec @pytest.fixture @@ -158,10 +158,11 @@ def test_image_webdataset_missing_keys(image_wds_file): assert decoded["jpeg"] is None assert decoded["txt"] is None - +@require_torchcodec @require_librosa @require_sndfile def test_audio_webdataset(audio_wds_file): + from torchcodec.decoders import AudioDecoder data_files = {"train": [audio_wds_file]} webdataset = WebDataset(data_files=data_files) split_generators = webdataset._split_generators(DownloadManager()) @@ -187,8 +188,7 @@ def test_audio_webdataset(audio_wds_file): decoded = webdataset.info.features.decode_example(encoded) assert isinstance(decoded["json"], dict) assert isinstance(decoded["json"]["transcript"], str) - assert isinstance(decoded["wav"], dict) - assert isinstance(decoded["wav"]["array"], np.ndarray) + assert isinstance(decoded["wav"], AudioDecoder) def test_webdataset_errors_on_bad_file(bad_wds_file): diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 147822fa8a1..a7194b4b667 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -315,11 +315,11 @@ def test_numpy_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) formatter = NumpyFormatter(features=Features({"audio": Audio()})) row = formatter.format_row(pa_table) - self.assertEqual(row["audio"]["array"].dtype, np.dtype(np.float32)) + self.assertEqual(row["audio"].get_all_samples().data.cpu().numpy().dtype, np.dtype(np.float32)) col = formatter.format_column(pa_table) - self.assertEqual(col[0]["array"].dtype, np.float32) + self.assertEqual(col[0].get_all_samples().data.cpu().numpy().dtype, np.float32) batch = formatter.format_batch(pa_table) - self.assertEqual(batch["audio"][0]["array"].dtype, np.dtype(np.float32)) + self.assertEqual(batch["audio"][0].get_all_samples().data.cpu().numpy().dtype, np.dtype(np.float32)) def test_pandas_formatter(self): pa_table = self._create_dummy_table() @@ -442,11 +442,11 @@ def test_torch_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) formatter = 
TorchFormatter(features=Features({"audio": Audio()})) row = formatter.format_row(pa_table) - self.assertEqual(row["audio"]["array"].dtype, torch.float32) + self.assertEqual(row["audio"].get_all_samples().data.dtype, torch.float32) col = formatter.format_column(pa_table) - self.assertEqual(col[0]["array"].dtype, torch.float32) + self.assertEqual(col[0].get_all_samples().data.dtype, torch.float32) batch = formatter.format_batch(pa_table) - self.assertEqual(batch["audio"][0]["array"].dtype, torch.float32) + self.assertEqual(batch["audio"][0].get_all_samples().data.dtype, torch.float32) @require_tf def test_tf_formatter(self): @@ -535,11 +535,14 @@ def test_tf_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) formatter = TFFormatter(features=Features({"audio": Audio()})) row = formatter.format_row(pa_table) - self.assertEqual(row["audio"]["array"].dtype, tf.float32) + tf_row = tf.convert_to_tensor(row["audio"].get_all_samples().data.cpu().numpy()) + self.assertEqual(tf_row.dtype, tf.float32) col = formatter.format_column(pa_table) - self.assertEqual(col[0]["array"].dtype, tf.float32) + tf_col_0 = tf.convert_to_tensor(col[0].get_all_samples().data.cpu().numpy()) + self.assertEqual(tf_col_0.dtype, tf.float32) batch = formatter.format_batch(pa_table) - self.assertEqual(batch["audio"][0]["array"].dtype, tf.float32) + tf_batch_0 = tf.convert_to_tensor(batch["audio"][0].get_all_samples().data.cpu().numpy()) + self.assertEqual(tf_batch_0.dtype, tf.float32) @require_jax def test_jax_formatter(self): diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index d8350c9e685..92473ad869f 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -402,7 +402,7 @@ def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): assert ds.column_names == hub_ds.column_names assert list(ds.features.keys()) == list(hub_ds.features.keys()) assert ds.features == hub_ds.features - np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"]) + np.testing.assert_equal(ds[0]["x"].get_all_samples().data.cpu().numpy(), hub_ds[0]["x"].get_all_samples().data.cpu().numpy()) assert ds[1] == hub_ds[1] # don't test hub_ds[0] since audio decoding might be slightly different hub_ds = hub_ds.cast_column("x", Audio(decode=False)) elem = hub_ds[0]["x"] From c1843c36a6e5841bdf4683ad71a6cc0aa3b3d870 Mon Sep 17 00:00:00 2001 From: Ty Todd Date: Sat, 14 Jun 2025 08:59:50 -0400 Subject: [PATCH 06/25] code formatting for torchcodec changes --- src/datasets/builder.py | 12 +-- src/datasets/features/audio.py | 23 +++-- src/datasets/features/features.py | 4 +- src/datasets/features/video.py | 85 ++++++++++--------- src/datasets/formatting/jax_formatter.py | 2 +- src/datasets/formatting/np_formatter.py | 2 +- src/datasets/formatting/tf_formatter.py | 2 +- src/datasets/formatting/torch_formatter.py | 2 +- .../audiofolder/audiofolder.py | 2 +- src/datasets/utils/extract.py | 6 +- tests/features/test_audio.py | 71 +++++++++++----- tests/features/test_video.py | 12 ++- tests/packaged_modules/test_audiofolder.py | 7 +- tests/packaged_modules/test_webdataset.py | 12 ++- tests/test_hub.py | 18 ++-- tests/test_iterable_dataset.py | 18 ++-- tests/test_upstream_hub.py | 5 +- tests/utils.py | 2 +- 18 files changed, 177 insertions(+), 108 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index acde1ec8af0..1cd4afae733 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -1529,9 +1529,9 @@ def _prepare_split( # 
the content is the number of examples progress update
                 pbar.update(content)

-        assert None not in examples_per_job, (
-            f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
-        )
+        assert (
+            None not in examples_per_job
+        ), f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"

         total_shards = sum(shards_per_job)
         total_num_examples = sum(examples_per_job)
@@ -1784,9 +1784,9 @@ def _prepare_split(

             # the content is the number of examples progress update
             pbar.update(content)

-        assert None not in examples_per_job, (
-            f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
-        )
+        assert (
+            None not in examples_per_job
+        ), f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"

         total_shards = sum(shards_per_job)
         total_num_examples = sum(examples_per_job)
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 7f2d84a669e..c19e5d3e472 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -9,14 +9,15 @@
 from .. import config
 from ..download.download_config import DownloadConfig
 from ..table import array_cast
-from ..utils.file_utils import xopen, is_local_path,xsplitext
+from ..utils.file_utils import is_local_path, xopen
 from ..utils.py_utils import no_op_if_value_is_null, string_to_dict


 if TYPE_CHECKING:
-    from .features import FeatureType
     from torchcodec.decoders import AudioDecoder

+    from .features import FeatureType
+

 @dataclass
 class Audio:
@@ -94,7 +95,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder
             from torchcodec.decoders import AudioDecoder
         except ImportError as err:
             raise ImportError("To support encoding audio data, please install 'torchcodec'.") from err
-
+
         if isinstance(value, str):
             return {"bytes": None, "path": value}
         elif isinstance(value, (bytes, bytearray)):
@@ -159,7 +160,7 @@ def decode_example(
             from torchcodec.decoders import AudioDecoder
         except ImportError as err:
             raise ImportError("To support decoding audio data, please install 'torchcodec'.") from err
-
+
         if not self.decode:
             raise RuntimeError("Decoding is disabled for this feature.
Please use Audio(decode=True) instead.") @@ -169,8 +170,10 @@ def decode_example( channels = 1 if self.mono else None if file is None and is_local_path(path): - ad = AudioDecoder(path, stream_index = self.stream_index, sample_rate = self.sampling_rate, num_channels = channels) - + ad = AudioDecoder( + path, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels + ) + elif file is None: token_per_repo_id = token_per_repo_id or {} source_url = path.split("::")[-1] @@ -182,10 +185,14 @@ def decode_example( download_config = DownloadConfig(token=token) with xopen(path, "rb", download_config=download_config) as f: - ad = AudioDecoder(f, stream_index = self.stream_index, sample_rate = self.sampling_rate, num_channels = channels) + ad = AudioDecoder( + f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels + ) else: - ad = AudioDecoder(file, stream_index = self.stream_index, sample_rate = self.sampling_rate, num_channels = channels) + ad = AudioDecoder( + file, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels + ) ad.metadata.path = path return ad diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 76a5af17ff1..7da158e1f37 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -301,9 +301,9 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules: import pdfplumber - + if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: - from torchcodec.decoders import VideoDecoder, AudioDecoder + from torchcodec.decoders import AudioDecoder, VideoDecoder if isinstance(obj, np.ndarray): if obj.ndim == 0: diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index 38800b50042..8e4aa8fa442 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -1,6 +1,6 @@ import os from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, ClassVar, Optional, TypedDict, Union, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, TypedDict, Union import numpy as np import pyarrow as pa @@ -48,17 +48,17 @@ class Video: returns the underlying dictionary in the format `{"path": video_path, "bytes": video_bytes}`. stream_index (`int`, *optional*): The streaming index to use from the file. If `None` defaults to the "best" index. - dimension_order (`str`, defaults to `NCHW`): - The dimension order of the decoded frames. - where N is the batch size, C is the number of channels, - H is the height, and W is the width of the frames. + dimension_order (`str`, defaults to `NCHW`): + The dimension order of the decoded frames. + where N is the batch size, C is the number of channels, + H is the height, and W is the width of the frames. num_ffmpeg_threads (`int`, defaults to `1`): The number of threads to use for decoding the video. (Recommended to keep this at 1) device (`str` or `torch.device`, defaults to `cpu`): The device to use for decoding the video. seek_mode (`str`, defaults to `exact`): - Determines if frame access will be “exact” or “approximate”. - Exact guarantees that requesting frame i will always return frame i, but doing so requires an initial scan of the file. + Determines if frame access will be “exact” or “approximate”. + Exact guarantees that requesting frame i will always return frame i, but doing so requires an initial scan of the file. 
Approximate is faster as it avoids scanning the file, but less accurate as it uses the file's metadata to calculate where i probably is. read more [here](https://docs.pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html#sphx-glr-generated-examples-approximate-mode-py) @@ -80,10 +80,10 @@ class Video: decode: bool = True id: Optional[str] = None stream_index: Optional[int] = None - dimension_order: Literal['NCHW', 'NHWC'] = 'NCHW' + dimension_order: Literal["NCHW", "NHWC"] = "NCHW" num_ffmpeg_threads: int = 1 - device: Optional[Union[str, "torch.device"]] = 'cpu' - seek_mode: Literal['exact', 'approximate'] = 'exact' + device: Optional[Union[str, "torch.device"]] = "cpu" + seek_mode: Literal["exact", "approximate"] = "exact" # Automatically constructed dtype: ClassVar[str] = "torchcodec.decoders.VideoDecoder" pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()}) @@ -180,32 +180,33 @@ def decode_example( raise ValueError(f"A video should have one of 'path' or 'bytes' but both are None in {value}.") elif is_local_path(path): video = VideoDecoder( - path, - stream_index = self.stream_index, - dimension_order=self.dimension_order, - num_ffmpeg_threads=self.num_ffmpeg_threads, - device = self.device, - seek_mode = self.seek_mode - ) + path, + stream_index=self.stream_index, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device=self.device, + seek_mode=self.seek_mode, + ) else: video = hf_video_reader( - path, - token_per_repo_id=token_per_repo_id, - dimension_order=self.dimension_order, - num_ffmpeg_threads=self.num_ffmpeg_threads, - device = self.device, - seek_mode = self.seek_mode - ) + path, + token_per_repo_id=token_per_repo_id, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device=self.device, + seek_mode=self.seek_mode, + ) else: video = VideoDecoder( - bytes_, - stream_index = self.stream_index, - dimension_order=self.dimension_order, - num_ffmpeg_threads=self.num_ffmpeg_threads, - device = self.device, - seek_mode = self.seek_mode - ) + bytes_, + stream_index=self.stream_index, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device=self.device, + seek_mode=self.seek_mode, + ) video._hf_encoded = {"path": path, "bytes": bytes_} + video.metadata.path = path return video def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]: @@ -290,14 +291,15 @@ def encode_np_array(array: np.ndarray) -> Example: # 1. store the encoded video data {"path": ..., "bytes": ...} in `video._hf_encoded`` # 2. 
add support for hf:// files + def hf_video_reader( - path: str, - token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, + path: str, + token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, stream: str = "video", - dimension_order: Literal['NCHW', 'NHWC'] = 'NCHW', - num_ffmpeg_threads: int = 1, - device: Optional[Union[str, "torch.device"]] = 'cpu', - seek_mode: Literal['exact', 'approximate'] = 'exact' + dimension_order: Literal["NCHW", "NHWC"] = "NCHW", + num_ffmpeg_threads: int = 1, + device: Optional[Union[str, "torch.device"]] = "cpu", + seek_mode: Literal["exact", "approximate"] = "exact", ) -> "VideoDecoder": from torchcodec.decoders import VideoDecoder @@ -313,5 +315,12 @@ def hf_video_reader( # Instantiate the VideoDecoder stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1]) - vd = VideoDecoder(f, stream_index=stream_id, dimension_order = dimension_order, num_ffmpeg_threads = num_ffmpeg_threads, device = device, seek_mode = seek_mode) + vd = VideoDecoder( + f, + stream_index=stream_id, + dimension_order=dimension_order, + num_ffmpeg_threads=num_ffmpeg_threads, + device=device, + seek_mode=seek_mode, + ) return vd diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py index f1a74110ef1..c52ef7a4d59 100644 --- a/src/datasets/formatting/jax_formatter.py +++ b/src/datasets/formatting/jax_formatter.py @@ -112,7 +112,7 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to jax arrays ? if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: - from torchcodec.decoders import VideoDecoder, AudioDecoder + from torchcodec.decoders import AudioDecoder, VideoDecoder if isinstance(value, (VideoDecoder, AudioDecoder)): return value # TODO(QL): set output to jax arrays ? diff --git a/src/datasets/formatting/np_formatter.py b/src/datasets/formatting/np_formatter.py index 83688ac0301..062d199c6f6 100644 --- a/src/datasets/formatting/np_formatter.py +++ b/src/datasets/formatting/np_formatter.py @@ -69,7 +69,7 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to np arrays ? if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: - from torchcodec.decoders import VideoDecoder, AudioDecoder + from torchcodec.decoders import AudioDecoder, VideoDecoder if isinstance(value, (VideoDecoder, AudioDecoder)): return value # TODO(QL): set output to np arrays ? diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 532b33f4fd0..1a20eb31d1d 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -76,7 +76,7 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to tf tensors ? if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: - from torchcodec.decoders import VideoDecoder, AudioDecoder + from torchcodec.decoders import AudioDecoder, VideoDecoder if isinstance(value, (VideoDecoder, AudioDecoder)): return value # TODO(QL): set output to jax arrays ? diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py index 2831841da70..3501f9368be 100644 --- a/src/datasets/formatting/torch_formatter.py +++ b/src/datasets/formatting/torch_formatter.py @@ -82,7 +82,7 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to torch tensors ? 
if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: - from torchcodec.decoders import VideoDecoder, AudioDecoder + from torchcodec.decoders import AudioDecoder, VideoDecoder if isinstance(value, (VideoDecoder, AudioDecoder)): return value # TODO(QL): set output to jax arrays ? diff --git a/src/datasets/packaged_modules/audiofolder/audiofolder.py b/src/datasets/packaged_modules/audiofolder/audiofolder.py index 5df06f7a09e..96f5e9d3c8a 100644 --- a/src/datasets/packaged_modules/audiofolder/audiofolder.py +++ b/src/datasets/packaged_modules/audiofolder/audiofolder.py @@ -82,4 +82,4 @@ class AudioFolder(folder_based_builder.FolderBasedBuilder): ".mxf", ".nut", ] -AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS \ No newline at end of file +AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 1e87c617217..161c7ba49b5 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -52,11 +52,13 @@ def extract(self, input_path: str, force_extract: bool = False) -> str: class BaseExtractor(ABC): @classmethod @abstractmethod - def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: ... + def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: + ... @staticmethod @abstractmethod - def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: ... + def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: + ... class MagicNumberBaseExtractor(BaseExtractor, ABC): diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 04e9b6233af..7593b990205 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -1,18 +1,14 @@ import os import tarfile +from itertools import product import pyarrow as pa import pytest from datasets import Dataset, concatenate_datasets, load_dataset from datasets.features import Audio, Features, Sequence, Value -from itertools import product -from ..utils import ( - require_librosa, - require_sndfile, - require_torchcodec -) +from ..utils import require_sndfile, require_torchcodec @pytest.fixture() @@ -46,8 +42,8 @@ def test_audio_instantiation(): assert audio.sampling_rate is None assert audio.mono is True assert audio.id is None - assert audio.stream_index == None - + assert audio.stream_index is None + assert audio.dtype == "dict" assert audio.pa_type == pa.struct({"bytes": pa.binary(), "path": pa.string()}) assert audio._type == "Audio" @@ -80,6 +76,7 @@ def test_audio_feature_type_to_arrow(): ) def test_audio_feature_encode_example(shared_datadir, build_example): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio() encoded_example = audio.encode_example(build_example(audio_path)) @@ -104,6 +101,7 @@ def test_audio_feature_encode_example(shared_datadir, build_example): ) def test_audio_feature_encode_example_pcm(shared_datadir, build_example): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_16000.pcm") audio = Audio(sampling_rate=16_000) encoded_example = audio.encode_example(build_example(audio_path)) @@ -115,6 +113,8 @@ def test_audio_feature_encode_example_pcm(shared_datadir, build_example): sample_rates = [16_000, 48_000] + + # @require_librosa @require_torchcodec @require_sndfile @@ -124,6 +124,7 @@ def test_audio_feature_encode_example_pcm(shared_datadir, build_example): ) def test_audio_feature_encode_example_audiodecoder(shared_datadir, in_sample_rate, 
out_sample_rate): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio(sampling_rate=out_sample_rate) example = AudioDecoder(audio_path, sample_rate=in_sample_rate) @@ -132,7 +133,7 @@ def test_audio_feature_encode_example_audiodecoder(shared_datadir, in_sample_rat assert encoded_example.keys() == {"bytes", "path"} assert encoded_example["bytes"] is not None or encoded_example["path"] is not None decoded_example = audio.decode_example(encoded_example) - assert isinstance(decoded_example, AudioDecoder) + assert isinstance(decoded_example, AudioDecoder) # @require_librosa @@ -140,6 +141,7 @@ def test_audio_feature_encode_example_audiodecoder(shared_datadir, in_sample_rat @require_sndfile def test_audio_decode_example(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) @@ -157,6 +159,7 @@ def test_audio_decode_example(shared_datadir): @require_sndfile def test_audio_resampling(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio(sampling_rate=16000) decoded_example = audio.decode_example(audio.encode_example(audio_path)) @@ -171,10 +174,11 @@ def test_audio_resampling(shared_datadir): @require_sndfile def test_audio_decode_example_mp3(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert isinstance(decoded_example, AudioDecoder) + assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 44100 assert samples.data.shape == (1, 110592) @@ -185,10 +189,11 @@ def test_audio_decode_example_mp3(shared_datadir): @require_sndfile def test_audio_decode_example_opus(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_48000.opus") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert isinstance(decoded_example, AudioDecoder) + assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 48000 assert samples.data.shape == (1, 48000) @@ -200,11 +205,12 @@ def test_audio_decode_example_opus(shared_datadir): @pytest.mark.parametrize("sampling_rate", [16_000, 48_000]) def test_audio_decode_example_pcm(shared_datadir, sampling_rate): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_16000.pcm") audio_input = {"path": audio_path, "sampling_rate": 16_000} audio = Audio(sampling_rate=sampling_rate) decoded_example = audio.decode_example(audio.encode_example(audio_input)) - assert isinstance(decoded_example, AudioDecoder) + assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == sampling_rate assert samples.data.shape == (1, 16208 * sampling_rate // 16_000) @@ -215,18 +221,19 @@ def test_audio_decode_example_pcm(shared_datadir, sampling_rate): @require_sndfile def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") audio_path2 = str(shared_datadir / "test_audio_16000.mp3") audio = 
Audio(sampling_rate=48000) decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert isinstance(decoded_example, AudioDecoder) + assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 48000 assert samples.data.shape == (1, 120373) decoded_example = audio.decode_example(audio.encode_example(audio_path2)) - assert isinstance(decoded_example, AudioDecoder) + assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 48000 assert samples.data.shape == (1, 122688) @@ -237,6 +244,7 @@ def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): @require_sndfile def test_dataset_with_audio_feature(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) @@ -267,6 +275,7 @@ def test_dataset_with_audio_feature(shared_datadir): @require_sndfile def test_dataset_with_audio_feature_tar_wav(tar_wav_path): from torchcodec.decoders import AudioDecoder + audio_filename = "test_audio_44100.wav" data = {"audio": []} for file_path, file_obj in iter_archive(tar_wav_path): @@ -280,6 +289,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): samples = item["audio"].get_all_samples() assert samples.sample_rate == 44100 assert samples.data.shape == (1, 202311) + assert item["audio"].metadata.path == audio_filename batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 @@ -287,6 +297,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 44100 assert samples.data.shape == (1, 202311) + assert batch["audio"][0].metadata.path == audio_filename column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) @@ -300,6 +311,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): @require_sndfile def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): from torchcodec.decoders import AudioDecoder + audio_filename = "test_audio_44100.mp3" data = {"audio": []} for file_path, file_obj in iter_archive(tar_mp3_path): @@ -313,6 +325,7 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): samples = item["audio"].get_all_samples() assert samples.sample_rate == 44100 assert samples.data.shape == (1, 110592) + assert item["audio"].metadata.path == audio_filename batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 @@ -320,6 +333,7 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 44100 assert samples.data.shape == (1, 110592) + assert batch["audio"][0].metadata.path == audio_filename column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) @@ -328,7 +342,6 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): assert samples.data.shape == (1, 110592) - @require_torchcodec @require_sndfile def test_dataset_with_audio_feature_with_none(): @@ -369,6 +382,7 @@ def test_dataset_with_audio_feature_with_none(): @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio(sampling_rate=16000)}) @@ -399,6 +413,7 @@ def 
test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") data = {"audio": [audio_path]} features = Features({"audio": Audio(sampling_rate=16000)}) @@ -408,20 +423,20 @@ def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (1, 40124) # (1, 40125) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (1, 40124) # (1, 40125) column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (1, 40124) # (1, 40125) # @require_librosa @@ -429,6 +444,7 @@ def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) @@ -463,6 +479,7 @@ def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) @@ -476,20 +493,20 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (1, 40124) # (1, 40125) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (1, 40124) # (1, 40125) column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (1, 40124) # (1, 40125) @require_torchcodec @@ -507,6 +524,7 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) ) def test_dataset_cast_to_audio_features(shared_datadir, build_data): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = build_data(audio_path) dset = Dataset.from_dict(data) @@ -528,8 +546,12 @@ def test_dataset_concatenate_audio_features(shared_datadir): dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()})) concatenated_dataset = 
concatenate_datasets([dset1, dset2]) assert len(concatenated_dataset) == len(dset1) + len(dset2) - assert concatenated_dataset[0]["audio"].get_all_samples().data.shape == dset1[0]["audio"].get_all_samples().data.shape - assert concatenated_dataset[1]["audio"].get_all_samples().data.shape == dset2[0]["audio"].get_all_samples().data.shape + assert ( + concatenated_dataset[0]["audio"].get_all_samples().data.shape == dset1[0]["audio"].get_all_samples().data.shape + ) + assert ( + concatenated_dataset[1]["audio"].get_all_samples().data.shape == dset2[0]["audio"].get_all_samples().data.shape + ) # @require_librosa @@ -613,6 +635,7 @@ def process_audio_sampling_rate_by_batch(batch): @require_sndfile def test_formatted_dataset_with_audio_feature(shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path, audio_path]} features = Features({"audio": Audio()}) @@ -680,6 +703,7 @@ def jsonl_audio_dataset_path(shared_datadir, tmp_path_factory): @pytest.mark.parametrize("streaming", [False, True]) def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, shared_datadir): from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data_files = jsonl_audio_dataset_path features = Features({"audio": Audio(), "text": Value("string")}) @@ -690,6 +714,7 @@ def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, sh samples = item["audio"].get_all_samples() assert samples.sample_rate == 44100 assert samples.data.shape == (1, 202311) + assert item["audio"].metadata.path == audio_path @require_sndfile diff --git a/tests/features/test_video.py b/tests/features/test_video.py index 455ac05ecc0..9cccbee27d8 100644 --- a/tests/features/test_video.py +++ b/tests/features/test_video.py @@ -1,6 +1,6 @@ import pytest -from datasets import Dataset, Features, Video, Value, Audio, load_dataset +from datasets import Dataset, Features, Value, Video, load_dataset from ..utils import require_torchcodec @@ -106,7 +106,7 @@ def process_audio_sampling_rate_by_example(example): decoded_dset = dset.map(process_audio_sampling_rate_by_example) for item in decoded_dset.cast_column("video", Video(decode=False)): assert item.keys() == {"video", "text", "double_begin_stream_seconds"} - assert item["double_begin_stream_seconds"] == 0.0 + assert item["double_begin_stream_seconds"] == 0.0 def process_audio_sampling_rate_by_batch(batch): double_fps = [] @@ -118,7 +118,8 @@ def process_audio_sampling_rate_by_batch(batch): decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True) for item in decoded_dset.cast_column("video", Video(decode=False)): assert item.keys() == {"video", "text", "double_begin_stream_seconds"} - assert item["double_begin_stream_seconds"] == 0.0 + assert item["double_begin_stream_seconds"] == 0.0 + @pytest.fixture def jsonl_video_dataset_path(shared_datadir, tmp_path_factory): @@ -132,10 +133,12 @@ def jsonl_video_dataset_path(shared_datadir, tmp_path_factory): f.write(json.dumps(item) + "\n") return path + @require_torchcodec @pytest.mark.parametrize("streaming", [False, True]) def test_load_dataset_with_video_feature(streaming, jsonl_video_dataset_path, shared_datadir): from torchcodec.decoders import VideoDecoder + video_path = str(shared_datadir / "test_video_66x50.mov") data_files = jsonl_video_dataset_path features = Features({"video": Video(), "text": Value("string")}) @@ -143,4 +146,5 @@ def 
test_load_dataset_with_video_feature(streaming, jsonl_video_dataset_path, sh item = dset[0] if not streaming else next(iter(dset)) assert item.keys() == {"video", "text"} assert isinstance(item["video"], VideoDecoder) - assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) \ No newline at end of file + assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) + assert item["video"].metadata.path == video_path diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py index 410115da647..3e7c1c61617 100644 --- a/tests/packaged_modules/test_audiofolder.py +++ b/tests/packaged_modules/test_audiofolder.py @@ -318,7 +318,12 @@ def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_ assert len(dataset) == expected_num_of_audios # make sure each sample has its own audio (all arrays are different) and metadata assert ( - sum(np.array_equal(dataset[0]["audio"].get_all_samples().data.numpy(), example["audio"].get_all_samples().data.numpy()) for example in dataset[1:]) + sum( + np.array_equal( + dataset[0]["audio"].get_all_samples().data.numpy(), example["audio"].get_all_samples().data.numpy() + ) + for example in dataset[1:] + ) == 0 ) assert len({example["text"] for example in dataset}) == expected_num_of_audios diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py index b7b3aa1348f..92cd763770a 100644 --- a/tests/packaged_modules/test_webdataset.py +++ b/tests/packaged_modules/test_webdataset.py @@ -1,13 +1,19 @@ import json import tarfile -import numpy as np import pytest from datasets import Audio, DownloadManager, Features, Image, Sequence, Value from datasets.packaged_modules.webdataset.webdataset import WebDataset -from ..utils import require_librosa, require_numpy1_on_windows, require_pil, require_sndfile, require_torch, require_torchcodec +from ..utils import ( + require_librosa, + require_numpy1_on_windows, + require_pil, + require_sndfile, + require_torch, + require_torchcodec, +) @pytest.fixture @@ -158,11 +164,13 @@ def test_image_webdataset_missing_keys(image_wds_file): assert decoded["jpeg"] is None assert decoded["txt"] is None + @require_torchcodec @require_librosa @require_sndfile def test_audio_webdataset(audio_wds_file): from torchcodec.decoders import AudioDecoder + data_files = {"train": [audio_wds_file]} webdataset = WebDataset(data_files=data_files) split_generators = webdataset._split_generators(DownloadManager()) diff --git a/tests/test_hub.py b/tests/test_hub.py index fac361ba3f4..3685812a45c 100644 --- a/tests/test_hub.py +++ b/tests/test_hub.py @@ -12,7 +12,8 @@ from datasets.utils.hub import hf_dataset_url -DUMMY_DATASET_SCRIPT = dedent("""\ +DUMMY_DATASET_SCRIPT = dedent( + """\ import datasets @@ -34,7 +35,8 @@ def _split_generators(self, dl_manager): def _generate_examples(self): for key in range(5): yield key, {"text": f"{self.config.name}-{key}"} -""") +""" +) @pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"]) @@ -64,7 +66,8 @@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf ) hf_api.upload_file( token=hf_token, - path_or_fileobj=dedent(f"""\ + path_or_fileobj=dedent( + f"""\ --- {METADATA_CONFIGS_FIELD}: - config_name: cats @@ -76,7 +79,8 @@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf - split: train path: dogs/train/* --- - """).encode(), + """ + ).encode(), path_in_repo="README.md", repo_id=repo_id, repo_type="dataset", @@ -93,7 +97,8 
@@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf CommitOperationDelete(path_in_repo="dogs/train/0000.csv", is_folder=False), CommitOperationAdd( path_in_repo="README.md", - path_or_fileobj=dedent(f"""\ + path_or_fileobj=dedent( + f"""\ --- {METADATA_CONFIGS_FIELD}: - config_name: cats @@ -101,7 +106,8 @@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf - split: train path: cats/train/* --- - """).encode(), + """ + ).encode(), ), ] assert mock_method.call_args.kwargs.get("operations") == expected_operations diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 855903fd8c2..07b43f0d98a 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -1206,9 +1206,9 @@ def test_skip_examples_iterable(): skip_ex_iterable = SkipExamplesIterable(base_ex_iterable, n=count) expected = list(generate_examples_fn(n=total))[count:] assert list(skip_ex_iterable) == expected - assert skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is skip_ex_iterable, ( - "skip examples makes the shards order fixed" - ) + assert ( + skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is skip_ex_iterable + ), "skip examples makes the shards order fixed" assert_load_state_dict_resumes_iteration(skip_ex_iterable) @@ -1218,9 +1218,9 @@ def test_take_examples_iterable(): take_ex_iterable = TakeExamplesIterable(base_ex_iterable, n=count) expected = list(generate_examples_fn(n=total))[:count] assert list(take_ex_iterable) == expected - assert take_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is take_ex_iterable, ( - "skip examples makes the shards order fixed" - ) + assert ( + take_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is take_ex_iterable + ), "skip examples makes the shards order fixed" assert_load_state_dict_resumes_iteration(take_ex_iterable) @@ -1290,9 +1290,9 @@ def test_horizontally_concatenated_examples_iterable(): concatenated_ex_iterable = HorizontallyConcatenatedMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2]) expected = [{**x, **y} for (_, x), (_, y) in zip(ex_iterable1, ex_iterable2)] assert [x for _, x in concatenated_ex_iterable] == expected - assert concatenated_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is concatenated_ex_iterable, ( - "horizontally concatenated examples makes the shards order fixed" - ) + assert ( + concatenated_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is concatenated_ex_iterable + ), "horizontally concatenated examples makes the shards order fixed" assert_load_state_dict_resumes_iteration(concatenated_ex_iterable) diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index 92473ad869f..144df538c33 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -402,7 +402,10 @@ def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): assert ds.column_names == hub_ds.column_names assert list(ds.features.keys()) == list(hub_ds.features.keys()) assert ds.features == hub_ds.features - np.testing.assert_equal(ds[0]["x"].get_all_samples().data.cpu().numpy(), hub_ds[0]["x"].get_all_samples().data.cpu().numpy()) + np.testing.assert_equal( + ds[0]["x"].get_all_samples().data.cpu().numpy(), + hub_ds[0]["x"].get_all_samples().data.cpu().numpy(), + ) assert ds[1] == hub_ds[1] # don't test hub_ds[0] since audio decoding might be slightly different hub_ds = hub_ds.cast_column("x", Audio(decode=False)) elem = hub_ds[0]["x"] diff --git 
a/tests/utils.py b/tests/utils.py
index 391cf6023e2..6aa08a75e2d 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -189,6 +189,7 @@ def require_torchvision(test_case):
         test_case = unittest.skip("test requires torchvision")(test_case)
     return test_case
 
+
 def require_torchcodec(test_case):
     """
     Decorator marking a test that requires torchcodec.
     """
@@ -201,7 +202,6 @@ def require_torchcodec(test_case):
     return test_case
 
 
-
 def require_pdfplumber(test_case):
     """
     Decorator marking a test that requires pdfplumber.
From e8b68e5683b68e0d5b4277b3c855a444bec44869 Mon Sep 17 00:00:00 2001
From: Ty Todd <49127578+TyTodd@users.noreply.github.com>
Date: Tue, 17 Jun 2025 10:37:08 -0400
Subject: [PATCH 07/25] Update src/datasets/features/audio.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 src/datasets/features/audio.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 22282a14471..d74b98d6ed5 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -184,7 +184,10 @@ def decode_example(
             token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
             download_config = DownloadConfig(token=token)
-        with xopen(path, "rb", download_config=download_config) as f:
+        f = xopen(path, "rb", download_config=download_config)
+        ad = AudioDecoder(
+            f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels
+        )
             ad = AudioDecoder(
                 f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels
             )
From e9a4a14cfc0463aa29469ff557de0bace7a50438 Mon Sep 17 00:00:00 2001
From: Ty Todd
Date: Tue, 17 Jun 2025 15:05:18 -0400
Subject: [PATCH 08/25] added backwards compatibility support and _hf_encoded
 for Audio feature.

---
 docs/source/about_dataset_features.mdx | 2 +-
 docs/source/audio_dataset.mdx | 2 +-
 docs/source/audio_load.mdx | 2 +-
 docs/source/audio_process.mdx | 2 +-
 docs/source/quickstart.mdx | 2 +-
 docs/source/use_dataset.mdx | 4 +-
 src/datasets/features/audio.py | 81 +++++++++++++++++---------
 src/datasets/features/video.py | 4 +-
 tests/features/test_audio.py | 29 +++++++++
 tests/features/test_video.py | 8 +--
 10 files changed, 96 insertions(+), 40 deletions(-)

diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx
index 4e5b182b7ed..4c2e7d62fe3 100644
--- a/docs/source/about_dataset_features.mdx
+++ b/docs/source/about_dataset_features.mdx
@@ -80,7 +80,7 @@ When you load an audio dataset and call the audio column, the [`Audio`] feature
 >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train")
 >>> dataset[0]["audio"]
-
+<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
 ```
diff --git a/docs/source/audio_dataset.mdx b/docs/source/audio_dataset.mdx
index 58c4b9f6345..4f1f9031fd8 100644
--- a/docs/source/audio_dataset.mdx
+++ b/docs/source/audio_dataset.mdx
@@ -27,7 +27,7 @@ You can load your own dataset using the paths to your audio files. 
Use the [`~Da
```py
>>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio())
>>> audio_dataset[0]["audio"]
-
+<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
```
Then upload the dataset to the Hugging Face Hub using [`Dataset.push_to_hub`]:
diff --git a/docs/source/audio_load.mdx b/docs/source/audio_load.mdx
index 6c3352671db..195e7e78620 100644
--- a/docs/source/audio_load.mdx
+++ b/docs/source/audio_load.mdx
@@ -15,7 +15,7 @@ You can load your own dataset using the paths to your audio files. Use the [`~Da
```py
>>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio())
>>> audio_dataset[0]["audio"]
-
+<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
```

## AudioFolder
diff --git a/docs/source/audio_process.mdx b/docs/source/audio_process.mdx
index cf91395b835..4fe62c6a03c 100644
--- a/docs/source/audio_process.mdx
+++ b/docs/source/audio_process.mdx
@@ -22,7 +22,7 @@ Audio files are decoded and resampled on-the-fly, so the next time you access an
 ```py
 >>> ad = dataset[0]["audio"]
-
+<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
 >>> ad = audio_dataset[0]["audio"]
 >>> ad.get_all_samples().sample_rate
 16000
diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx
index 5bdae6b491f..779ffa291bd 100644
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@@ -118,7 +118,7 @@ Audio datasets are loaded just like text datasets. However, an audio dataset is
 ```py
 >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
 >>> dataset[0]["audio"]
-
+<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
 ```
 
 **4**. Create a function to preprocess the audio `array` with the feature extractor, and truncate and pad the sequences into tidy rectangular tensors. The most important thing to remember is to call the audio `array` in the feature extractor since the `array` - the actual speech signal - is the model input.
diff --git a/docs/source/use_dataset.mdx b/docs/source/use_dataset.mdx
index 6b3d83ff48e..23b0db4d88e 100644
--- a/docs/source/use_dataset.mdx
+++ b/docs/source/use_dataset.mdx
@@ -117,7 +117,7 @@ Audio inputs like text datasets need to be divided into discrete data points. Th
 ```py
 >>> ad = dataset[0]["audio"]
 >>> print(ad)
-
+<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
 >>> ad.get_all_samples().sample_rate
 8000
 ```
@@ -130,7 +130,7 @@ Use the [`~Dataset.cast_column`] function and set the `sampling_rate` parameter
 >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
 >>> ad = dataset[0]["audio"]
 >>> print(ad)
-
+<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
 >>> ad.get_all_samples().sample_rate
 16000
 ```
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index d74b98d6ed5..6fc594accae 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -13,11 +13,28 @@
 from ..utils.py_utils import no_op_if_value_is_null, string_to_dict
 
 
+def AudioDecoderClsGenerator():
+    import torchcodec.decoders as tcodec
+
+    class AudioDecoder(
+        tcodec.AudioDecoder
+    ):  # NOTE: array and sampling_rate are loaded each call. 
Maybe better to cache this in the future
+        def __getitem__(self, key: str):
+            if key == "array":
+                return self.get_all_samples().data
+            elif key == "sampling_rate":
+                return self.get_all_samples().sample_rate
+
+    return AudioDecoder
+
+
 if TYPE_CHECKING:
-    from torchcodec.decoders import AudioDecoder
+    import torchcodec.decoders as tcodec
 
     from .features import FeatureType
 
+    AudioDecoder = AudioDecoderClsGenerator()
+
 
 @dataclass
 class Audio:
@@ -59,7 +76,7 @@ class Audio:
     >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train")
     >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
     >>> ds[0]["audio"]
-
+    <datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
     ```
     """
 
@@ -76,7 +93,7 @@ class Audio:
     def __call__(self):
         return self.pa_type
 
-    def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder"]) -> dict:
+    def encode_example(self, value: Union[str, bytes, bytearray, dict, "tcodec.AudioDecoder", "AudioDecoder"]) -> dict:
        """Encode example into a format for Arrow. 
Please use Audio(decode=True) instead.") - path, file = (value["path"], BytesIO(value["bytes"])) if value["bytes"] is not None else (value["path"], None) - if path is None and file is None: + path, bytes = (value["path"], BytesIO(value["bytes"])) if value["bytes"] is not None else (value["path"], None) + if path is None and bytes is None: raise ValueError(f"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.") channels = 1 if self.mono else None - if file is None and is_local_path(path): + if bytes is None and is_local_path(path): ad = AudioDecoder( path, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels ) - elif file is None: + elif bytes is None: token_per_repo_id = token_per_repo_id or {} source_url = path.split("::")[-1] pattern = ( @@ -185,17 +203,13 @@ def decode_example( download_config = DownloadConfig(token=token) f = xopen(path, "rb", download_config=download_config) - ad = AudioDecoder( - f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels - ) - ad = AudioDecoder( - f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels - ) + ad = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels) else: ad = AudioDecoder( - file, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels + bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels ) + ad._hf_encoded = {"path": path, "bytes": bytes} ad.metadata.path = path return ad @@ -287,3 +301,14 @@ def path_to_bytes(path): ) storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null()) return array_cast(storage, self.pa_type) + + +def encode_torchcodec_audio(audio: "AudioDecoder", sf: Any) -> dict: + if hasattr(audio, "_hf_encoded"): + return audio._hf_encoded + else: + samples = audio.get_all_samples() + array = samples.data.cpu().numpy().T + buffer = BytesIO() + sf.write(buffer, array, samples.sample_rate, format="wav") + return {"bytes": buffer.getvalue(), "path": None} diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index b5a697b448c..2bad95fbbf2 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -102,9 +102,11 @@ def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray Returns: `dict` with "path" and "bytes" fields """ + if value is None: + raise ValueError("value must be provided") + if config.TORCHCODEC_AVAILABLE: from torchcodec.decoders import VideoDecoder - else: VideoDecoder = None diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 3dabbeeced5..4e74c0ce502 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -178,6 +178,7 @@ def test_audio_decode_example_mp3(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.mp3") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) + print("decoded_example", decoded_example) assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 44100 @@ -239,6 +240,34 @@ def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): assert samples.data.shape == (1, 122688) +@require_torchcodec +@require_sndfile +def test_backwards_compatibility(shared_datadir): + from torchcodec.decoders import AudioDecoder + + audio_path = str(shared_datadir / 
"test_audio_44100.mp3") + audio_path2 = str(shared_datadir / "test_audio_16000.mp3") + audio = Audio(sampling_rate=48000) + + decoded_example = audio.decode_example(audio.encode_example(audio_path)) + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert decoded_example["sampling_rate"] == samples.sample_rate + assert ( + decoded_example["array"].shape[0] == samples.data.shape[0] + and abs(decoded_example["array"].shape[1] - samples.data.shape[1]) < 2 + ) # can have off by one error + + decoded_example = audio.decode_example(audio.encode_example(audio_path2)) + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert decoded_example["sampling_rate"] == samples.sample_rate + assert ( + decoded_example["array"].shape[0] == samples.data.shape[0] + and abs(decoded_example["array"].shape[1] - samples.data.shape[1]) < 2 + ) # can have off by one error + + # @require_librosa @require_torchcodec @require_sndfile diff --git a/tests/features/test_video.py b/tests/features/test_video.py index 842ac32d39c..1acaf253bd4 100644 --- a/tests/features/test_video.py +++ b/tests/features/test_video.py @@ -1,6 +1,6 @@ import pytest -from datasets import Column, Dataset, Features, Video +from datasets import Column, Dataset, Features, Value, Video, load_dataset from ..utils import require_torchcodec @@ -53,10 +53,10 @@ def test_dataset_with_video_feature(shared_datadir): assert isinstance(batch["video"][0].get_frame_at(0).data, torch.Tensor) column = dset["video"] assert len(column) == 1 - + assert isinstance(column, Column) and all(isinstance(item, VideoDecoder) for item in column) - assert next(column[0]).get_frame_at(0).data.shape == (3, 50, 66) - assert isinstance(next(column[0]).get_frame_at(0).data, torch.Tensor) + assert next(iter(column)).get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(next(iter(column)).get_frame_at(0).data, torch.Tensor) # from bytes with open(video_path, "rb") as f: From 6c0e4256d712c06d96829ae847cc5e934ce7239f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 18 Jun 2025 17:53:12 +0200 Subject: [PATCH 09/25] move AudioDecoder to its own file --- src/datasets/builder.py | 12 ++++----- src/datasets/features/_torchcodec.py | 13 ++++++++++ src/datasets/features/audio.py | 39 ++++++++++------------------ src/datasets/utils/extract.py | 6 ++--- tests/test_iterable_dataset.py | 18 ++++++------- 5 files changed, 43 insertions(+), 45 deletions(-) create mode 100644 src/datasets/features/_torchcodec.py diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 1cd4afae733..acde1ec8af0 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -1529,9 +1529,9 @@ def _prepare_split( # the content is the number of examples progress update pbar.update(content) - assert ( - None not in examples_per_job - ), f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results" + assert None not in examples_per_job, ( + f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results" + ) total_shards = sum(shards_per_job) total_num_examples = sum(examples_per_job) @@ -1784,9 +1784,9 @@ def _prepare_split( # the content is the number of examples progress update pbar.update(content) - assert ( - None not in examples_per_job - ), f"Failed to retrieve results from prepare_split: result list 
{examples_per_job} still contains None - at least one worker failed to return its results"
+        assert None not in examples_per_job, (
+            f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
+        )
 
         total_shards = sum(shards_per_job)
         total_num_examples = sum(examples_per_job)
diff --git a/src/datasets/features/_torchcodec.py b/src/datasets/features/_torchcodec.py
new file mode 100644
index 00000000000..fb45563a199
--- /dev/null
+++ b/src/datasets/features/_torchcodec.py
@@ -0,0 +1,13 @@
+from torchcodec.decoders import AudioDecoder as _AudioDecoder
+
+
+class AudioDecoder(_AudioDecoder):
+    def __getitem__(self, key: str):
+        if key == "array":
+            return self.get_all_samples().data
+        elif key == "sampling_rate":
+            return self.get_samples_played_in_range(0, 0).sample_rate
+        elif hasattr(super(), "__getitem__"):  # defer only if the base class is actually subscriptable
+            return super().__getitem__(key)
+        else:
+            raise TypeError("'torchcodec.decoders.AudioDecoder' object is not subscriptable")
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 6fc594accae..44ce85ce0c7 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -13,28 +13,11 @@
 from ..utils.py_utils import no_op_if_value_is_null, string_to_dict
 
 
-def AudioDecoderClsGenerator():
-    import torchcodec.decoders as tcodec
-
-    class AudioDecoder(
-        tcodec.AudioDecoder
-    ):  # NOTE: array and sampling_rate are loaded each call. Maybe better to cache this in the future
-        def __getitem__(self, key: str):
-            if key == "array":
-                return self.get_all_samples().data
-            elif key == "sampling_rate":
-                return self.get_all_samples().sample_rate
-
-    return AudioDecoder
-
-
 if TYPE_CHECKING:
-    import torchcodec.decoders as tcodec
+    from torchcodec.decoders import AudioDecoder
 
     from .features import FeatureType
 
-    AudioDecoder = AudioDecoderClsGenerator()
-
 
 @dataclass
 class Audio:
@@ -76,7 +59,7 @@ class Audio:
     >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train")
     >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
     >>> ds[0]["audio"]
-    <datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
+    <datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>
     ```
     """
 
@@ -93,7 +76,7 @@ class Audio:
     def __call__(self):
         return self.pa_type
 
-    def encode_example(self, value: Union[str, bytes, bytearray, dict, "tcodec.AudioDecoder", "AudioDecoder"]) -> dict:
+    def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder"]) -> dict:
        """Encode example into a format for Arrow. 
Args: @@ -112,9 +95,8 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "tcodec.Audio raise ValueError("value must be provided") if config.TORCHCODEC_AVAILABLE: - import torchcodec.decoders as tcodec + from torchcodec.decoders import AudioDecoder - AudioDecoder = AudioDecoderClsGenerator() else: AudioDecoder = None @@ -122,8 +104,8 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "tcodec.Audio return {"bytes": None, "path": value} elif isinstance(value, (bytes, bytearray)): return {"bytes": value, "path": None} - elif AudioDecoder is not None and isinstance(value, (AudioDecoder, tcodec.AudioDecoder)): - return encode_torchcodec_audio(value, sf) + elif AudioDecoder is not None and isinstance(value, AudioDecoder): + return encode_torchcodec_audio(value) elif "array" in value: # convert the audio array to wav bytes buffer = BytesIO() @@ -175,7 +157,7 @@ def decode_example( `AudioDecoder` """ if config.TORCHCODEC_AVAILABLE: - AudioDecoder = AudioDecoderClsGenerator() + from ._torchcodec import AudioDecoder else: raise ImportError("To support decoding audio data, please install 'torchcodec'.") @@ -303,10 +285,15 @@ def path_to_bytes(path): return array_cast(storage, self.pa_type) -def encode_torchcodec_audio(audio: "AudioDecoder", sf: Any) -> dict: +def encode_torchcodec_audio(audio: "AudioDecoder") -> dict: if hasattr(audio, "_hf_encoded"): return audio._hf_encoded else: + try: + import soundfile as sf # soundfile is a dependency of librosa, needed to decode audio files. + except ImportError as err: + raise ImportError("To support encoding audio data, please install 'soundfile'.") from err + samples = audio.get_all_samples() array = samples.data.cpu().numpy().T buffer = BytesIO() diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 161c7ba49b5..1e87c617217 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -52,13 +52,11 @@ def extract(self, input_path: str, force_extract: bool = False) -> str: class BaseExtractor(ABC): @classmethod @abstractmethod - def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: - ... + def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: ... @staticmethod @abstractmethod - def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: - ... + def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: ... 
class MagicNumberBaseExtractor(BaseExtractor, ABC):
diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py
index 07b43f0d98a..855903fd8c2 100644
--- a/tests/test_iterable_dataset.py
+++ b/tests/test_iterable_dataset.py
@@ -1206,9 +1206,9 @@ def test_skip_examples_iterable():
     skip_ex_iterable = SkipExamplesIterable(base_ex_iterable, n=count)
     expected = list(generate_examples_fn(n=total))[count:]
     assert list(skip_ex_iterable) == expected
-    assert (
-        skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is skip_ex_iterable
-    ), "skip examples makes the shards order fixed"
+    assert skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is skip_ex_iterable, (
+        "skip examples makes the shards order fixed"
+    )
     assert_load_state_dict_resumes_iteration(skip_ex_iterable)
 
 
@@ -1218,9 +1218,9 @@ def test_take_examples_iterable():
     take_ex_iterable = TakeExamplesIterable(base_ex_iterable, n=count)
     expected = list(generate_examples_fn(n=total))[:count]
     assert list(take_ex_iterable) == expected
-    assert (
-        take_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is take_ex_iterable
-    ), "skip examples makes the shards order fixed"
+    assert take_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is take_ex_iterable, (
+        "skip examples makes the shards order fixed"
+    )
     assert_load_state_dict_resumes_iteration(take_ex_iterable)
 
 
@@ -1290,9 +1290,9 @@ def test_horizontally_concatenated_examples_iterable():
     concatenated_ex_iterable = HorizontallyConcatenatedMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])
     expected = [{**x, **y} for (_, x), (_, y) in zip(ex_iterable1, ex_iterable2)]
     assert [x for _, x in concatenated_ex_iterable] == expected
-    assert (
-        concatenated_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is concatenated_ex_iterable
-    ), "horizontally concatenated examples makes the shards order fixed"
+    assert concatenated_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is concatenated_ex_iterable, (
+        "horizontally concatenated examples makes the shards order fixed"
+    )
     assert_load_state_dict_resumes_iteration(concatenated_ex_iterable)
 

From e74a9eecb01e1a89e22b35a20c5734f76345a162 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Wed, 18 Jun 2025 17:56:08 +0200
Subject: [PATCH 10/25] naming

---
 docs/source/about_dataset_features.mdx | 2 +-
 docs/source/audio_dataset.mdx | 2 +-
 docs/source/audio_load.mdx | 2 +-
 docs/source/audio_process.mdx | 8 ++++----
 docs/source/quickstart.mdx | 2 +-
 docs/source/use_dataset.mdx | 14 +++++++-------
 src/datasets/features/audio.py | 12 ++++++------
 7 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx
index 4c2e7d62fe3..d575e28065d 100644
--- a/docs/source/about_dataset_features.mdx
+++ b/docs/source/about_dataset_features.mdx
@@ -80,7 +80,7 @@ When you load an audio dataset and call the audio column, the [`Audio`] feature
 >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train")
 >>> dataset[0]["audio"]
-<datasets.features.audio.AudioDecoderClsGenerator.<locals>.AudioDecoder object at 0x11642b6a0>
+<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>
 ```
diff --git a/docs/source/audio_dataset.mdx b/docs/source/audio_dataset.mdx
index 4f1f9031fd8..8419a70f698 100644
--- a/docs/source/audio_dataset.mdx
+++ b/docs/source/audio_dataset.mdx
@@ -27,7 +27,7 @@ You can load your own dataset using the paths to your audio files. 
Use the [`~Da ```py >>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio()) >>> audio_dataset[0]["audio"] -.AudioDecoder object at 0x11642b6a0> + ``` Then upload the dataset to the Hugging Face Hub using [`Dataset.push_to_hub`]: diff --git a/docs/source/audio_load.mdx b/docs/source/audio_load.mdx index 195e7e78620..0321a82e624 100644 --- a/docs/source/audio_load.mdx +++ b/docs/source/audio_load.mdx @@ -15,7 +15,7 @@ You can load your own dataset using the paths to your audio files. Use the [`~Da ```py >>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio()) >>> audio_dataset[0]["audio"] -.AudioDecoder object at 0x11642b6a0> + ``` ## AudioFolder diff --git a/docs/source/audio_process.mdx b/docs/source/audio_process.mdx index 4fe62c6a03c..3ae8fe941d3 100644 --- a/docs/source/audio_process.mdx +++ b/docs/source/audio_process.mdx @@ -21,10 +21,10 @@ The [`~Dataset.cast_column`] function is used to cast a column to another featur Audio files are decoded and resampled on-the-fly, so the next time you access an example, the audio file is resampled to 16kHz: ```py ->>> ad = dataset[0]["audio"] -.AudioDecoder object at 0x11642b6a0> ->>> ad = audio_dataset[0]["audio"] ->>> ad.get_all_samples().sample_rate +>>> audio = dataset[0]["audio"] + +>>> audio = audio_dataset[0]["audio"] +>>> audio.get_all_samples().sample_rate 16000 ``` diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 779ffa291bd..092940de95c 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -118,7 +118,7 @@ Audio datasets are loaded just like text datasets. However, an audio dataset is ```py >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) >>> dataset[0]["audio"] -.AudioDecoder object at 0x11642b6a0> + ``` **4**. Create a function to preprocess the audio `array` with the feature extractor, and truncate and pad the sequences into tidy rectangular tensors. The most important thing to remember is to call the audio `array` in the feature extractor since the `array` - the actual speech signal - is the model input. diff --git a/docs/source/use_dataset.mdx b/docs/source/use_dataset.mdx index 23b0db4d88e..5d08275041e 100644 --- a/docs/source/use_dataset.mdx +++ b/docs/source/use_dataset.mdx @@ -115,10 +115,10 @@ Audio inputs like text datasets need to be divided into discrete data points. Th **2**. Index into the first row of the dataset. 
When you call the `audio` column of the dataset, it is automatically decoded and resampled: ```py ->>> ad = dataset[0]["audio"] +>>> audio = dataset[0]["audio"] >>> print(ad) -.AudioDecoder object at 0x11642b6a0> ->>> ad.get_all_samples().sample_rate + +>>> audio.get_all_samples().sample_rate 8000 ``` @@ -128,10 +128,10 @@ Use the [`~Dataset.cast_column`] function and set the `sampling_rate` parameter ```py >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) ->>> ad = dataset[0]["audio"] ->>> print(ad) -.AudioDecoder object at 0x11642b6a0> ->>> ad.get_all_samples().sample_rate +>>> audio = dataset[0]["audio"] +>>> print(audio) + +>>> audio.get_all_samples().sample_rate 16000 ``` diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 44ce85ce0c7..8675b4e75e5 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -170,7 +170,7 @@ def decode_example( channels = 1 if self.mono else None if bytes is None and is_local_path(path): - ad = AudioDecoder( + audio = AudioDecoder( path, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels ) @@ -185,15 +185,15 @@ def decode_example( download_config = DownloadConfig(token=token) f = xopen(path, "rb", download_config=download_config) - ad = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels) + audio = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels) else: - ad = AudioDecoder( + audio = AudioDecoder( bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels ) - ad._hf_encoded = {"path": path, "bytes": bytes} - ad.metadata.path = path - return ad + audio._hf_encoded = {"path": path, "bytes": bytes} + audio.metadata.path = path + return audio def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]: """If in the decodable state, raise an error, otherwise flatten the feature into a dictionary.""" From 28e017376127d9a97ffccdebc5ec3affe6ca42b7 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 18 Jun 2025 18:26:57 +0200 Subject: [PATCH 11/25] docs --- docs/source/audio_process.mdx | 6 +++++- docs/source/use_dataset.mdx | 2 +- src/datasets/features/audio.py | 23 +++++++++++++++++------ src/datasets/features/image.py | 4 +++- src/datasets/features/video.py | 16 +++++++++++++--- 5 files changed, 39 insertions(+), 12 deletions(-) diff --git a/docs/source/audio_process.mdx b/docs/source/audio_process.mdx index 3ae8fe941d3..e822da6381a 100644 --- a/docs/source/audio_process.mdx +++ b/docs/source/audio_process.mdx @@ -24,7 +24,11 @@ Audio files are decoded and resampled on-the-fly, so the next time you access an >>> audio = dataset[0]["audio"] >>> audio = audio_dataset[0]["audio"] ->>> audio.get_all_samples().sample_rate +>>> samples = audio.get_all_samples() +>>> samples.data +tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3447e-06, + -1.9127e-04, -5.3330e-05]] +>>> samples.sample_rate 16000 ``` diff --git a/docs/source/use_dataset.mdx b/docs/source/use_dataset.mdx index 5d08275041e..aaaf9e1cfdf 100644 --- a/docs/source/use_dataset.mdx +++ b/docs/source/use_dataset.mdx @@ -116,7 +116,7 @@ Audio inputs like text datasets need to be divided into discrete data points. 
Th ```py >>> audio = dataset[0]["audio"] ->>> print(ad) +>>> print(audio) >>> audio.get_all_samples().sample_rate 8000 diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 8675b4e75e5..da195d77bed 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -30,15 +30,19 @@ class Audio: - `path`: String with relative path of the audio file to the archive file. - `bytes`: Bytes content of the audio file. - This is useful for archived files with sequential access. + This is useful for parquet or webdataset files which embed audio files. - A `dict` with the keys: - - `path`: String with relative path of the audio file to the archive file. - `array`: Array containing the audio sample - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample. - This is useful for archived files with sequential access. + - A `torchcodec.decoders.AudioDecoder`: torchcodec audio decoder object. + + Output: The Audio features output data as `torchcodec.decoders.AudioDecoder` objects, with additional keys: + + - `array`: Array containing the audio sample + - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample. Args: sampling_rate (`int`, *optional*): @@ -57,9 +61,16 @@ class Audio: ```py >>> from datasets import load_dataset, Audio >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train") - >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000)) + >>> ds = ds.cast_column("audio", Audio(sampling_rate=44100)) >>> ds[0]["audio"] - + + >>> audio = ds[0]["audio"] + >>> audio.get_samples_played_in_range(0, 10) + AudioSamples: + data (shape): torch.Size([2, 110592]) + pts_seconds: 0.0 + duration_seconds: 2.507755102040816 + sample_rate: 44100 ``` """ @@ -154,7 +165,7 @@ def decode_example( a dictionary repo_id (`str`) -> token (`bool` or `str`) Returns: - `AudioDecoder` + `torchcodec.decoders.AudioDecoder` """ if config.TORCHCODEC_AVAILABLE: from ._torchcodec import AudioDecoder diff --git a/src/datasets/features/image.py b/src/datasets/features/image.py index 79794beb12f..ad2e6bdfaec 100644 --- a/src/datasets/features/image.py +++ b/src/datasets/features/image.py @@ -53,11 +53,13 @@ class Image: - `path`: String with relative path of the image file to the archive file. - `bytes`: Bytes of the image file. - This is useful for archived files with sequential access. + This is useful for parquet or webdataset files which embed image files. - An `np.ndarray`: NumPy array representing an image. - A `PIL.Image.Image`: PIL image object. + Output: The Image features output data as `PIL.Image.Image` objects. + Args: mode (`str`, *optional*): The mode to convert the image to. If `None`, the native mode of the image is used. diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index 2bad95fbbf2..6b4bdc97a6c 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -27,7 +27,7 @@ class Example(TypedDict): @dataclass class Video: """ - **Experimental.** Video [`Feature`] to read video data from a video file. + Video [`Feature`] to read video data from a video file. Input: The Video feature accepts as input: - A `str`: Absolute path to the video file (i.e. random access is allowed). @@ -36,10 +36,12 @@ class Video: - `path`: String with relative path of the video file in a dataset repository. - `bytes`: Bytes of the video file. - This is useful for archived files with sequential access. + This is useful for parquet or webdataset files which embed video files. 
- A `torchcodec.decoders.VideoDecoder`: torchcodec video decoder object.

+    Output: The Video features output data as `torchcodec.decoders.VideoDecoder` objects.
+
    Args:
        mode (`str`, *optional*):
            The mode to convert the video to. If `None`, the native mode of the video is used.
@@ -71,7 +73,15 @@ class Video:
        Video(decode=True, id=None)
        >>> ds[0]["video"]

-        >>> ds = ds.cast_column('video', Video(decode=False))
+        >>> video = ds[0]["video"]
+        >>> video.get_frames_in_range(0, 10)
+        FrameBatch:
+          data (shape): torch.Size([10, 3, 50, 66])
+          pts_seconds: tensor([0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333,
+                  0.4333], dtype=torch.float64)
+          duration_seconds: tensor([0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167,
+                  0.0167], dtype=torch.float64)
+        >>> ds.cast_column('video', Video(decode=False))[0]["video"]
        {'bytes': None, 'path': 'path/to/Screen Recording.mov'}
        ```

From c50c505e49070140de16702744ef3c0609d8b6d1 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Wed, 18 Jun 2025 18:28:15 +0200
Subject: [PATCH 12/25] style

---
 src/datasets/features/audio.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index da195d77bed..559d483bf88 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -196,7 +196,9 @@ def decode_example(
 download_config = DownloadConfig(token=token)
 f = xopen(path, "rb", download_config=download_config)
- audio = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels)
+ audio = AudioDecoder(
+ f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels
+ )
 else:
 audio = AudioDecoder(

From 806a4ba6f9e6803e52f0eb1dbea56345ef883141 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 19 Jun 2025 16:18:11 +0200
Subject: [PATCH 13/25] update tests

---
 setup.py | 4 +---
 src/datasets/features/_torchcodec.py | 4 ++--
 src/datasets/features/audio.py | 4 ++--
 tests/features/test_audio.py | 21 ---------------------
 tests/packaged_modules/test_audiofolder.py | 13 ++++++-------
 tests/packaged_modules/test_webdataset.py | 2 --
 tests/test_formatting.py | 8 ++++----
 tests/test_upstream_hub.py | 4 ++--
 tests/utils.py | 1 -
 9 files changed, 17 insertions(+), 44 deletions(-)

diff --git a/setup.py b/setup.py
index 64f266eee16..b1553e14797 100644
--- a/setup.py
+++ b/setup.py
@@ -137,8 +137,7 @@
 AUDIO_REQUIRE = [
 "soundfile>=0.12.1",
- "librosa",
- "soxr>=0.4.0", # Supports numpy-2
+ "torchcodec>=0.4.0",
 ]

 VISION_REQUIRE = [
@@ -195,7 +194,6 @@
 NUMPY2_INCOMPATIBLE_LIBRARIES = [
 "faiss-cpu",
- "librosa", # librosa -> numba-0.60.0 requires numpy < 2.1 (see GH-7111)
 "tensorflow",
 ]
 TESTS_NUMPY2_REQUIRE = [
diff --git a/src/datasets/features/_torchcodec.py b/src/datasets/features/_torchcodec.py
index fb45563a199..daf6b686f64 100644
--- a/src/datasets/features/_torchcodec.py
+++ b/src/datasets/features/_torchcodec.py
@@ -4,10 +4,10 @@ class AudioDecoder(_AudioDecoder):
 def __getitem__(self, key: str):
 if key == "array":
- return self.get_all_samples().data
+ return self.get_all_samples().data.cpu().numpy()
 elif key == "sampling_rate":
 return self.get_samples_played_in_range(0, 0).sample_rate
- elif hasattr(self, "__getitem__"):
+ elif hasattr(super(), "__getitem__"):
 return super().__getitem__(key)
 else:
 raise TypeError("'torchcodec.decoders.AudioDecoder' object is not subscriptable")
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 
559d483bf88..9fb2120abd2 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -98,7 +98,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder `dict` """ try: - import soundfile as sf # soundfile is a dependency of librosa, needed to decode audio files. + import soundfile as sf # needed to write audio files except ImportError as err: raise ImportError("To support encoding audio data, please install 'soundfile'.") from err @@ -303,7 +303,7 @@ def encode_torchcodec_audio(audio: "AudioDecoder") -> dict: return audio._hf_encoded else: try: - import soundfile as sf # soundfile is a dependency of librosa, needed to decode audio files. + import soundfile as sf # needed to write audio files except ImportError as err: raise ImportError("To support encoding audio data, please install 'soundfile'.") from err diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 4e74c0ce502..17c8610124b 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -58,7 +58,6 @@ def test_audio_feature_type_to_arrow(): assert features.arrow_schema == pa.schema({"sequence_of_audios": pa.list_(Audio().pa_type)}) -# @require_librosa @require_torchcodec @require_sndfile @pytest.mark.parametrize( @@ -87,7 +86,6 @@ def test_audio_feature_encode_example(shared_datadir, build_example): assert isinstance(decoded_example, AudioDecoder) -# @require_librosa @require_torchcodec @require_sndfile @pytest.mark.parametrize( @@ -115,7 +113,6 @@ def test_audio_feature_encode_example_pcm(shared_datadir, build_example): sample_rates = [16_000, 48_000] -# @require_librosa @require_torchcodec @require_sndfile @pytest.mark.parametrize( @@ -136,7 +133,6 @@ def test_audio_feature_encode_example_audiodecoder(shared_datadir, in_sample_rat assert isinstance(decoded_example, AudioDecoder) -# @require_librosa @require_torchcodec @require_sndfile def test_audio_decode_example(shared_datadir): @@ -154,7 +150,6 @@ def test_audio_decode_example(shared_datadir): Audio(decode=False).decode_example(audio_path) -# @require_librosa @require_torchcodec @require_sndfile def test_audio_resampling(shared_datadir): @@ -169,7 +164,6 @@ def test_audio_resampling(shared_datadir): assert samples.data.shape == (1, 73401) -# @require_librosa @require_torchcodec @require_sndfile def test_audio_decode_example_mp3(shared_datadir): @@ -185,7 +179,6 @@ def test_audio_decode_example_mp3(shared_datadir): assert samples.data.shape == (1, 110592) -# @require_librosa @require_torchcodec @require_sndfile def test_audio_decode_example_opus(shared_datadir): @@ -200,7 +193,6 @@ def test_audio_decode_example_opus(shared_datadir): assert samples.data.shape == (1, 48000) -# @require_librosa @require_torchcodec @require_sndfile @pytest.mark.parametrize("sampling_rate", [16_000, 48_000]) @@ -217,7 +209,6 @@ def test_audio_decode_example_pcm(shared_datadir, sampling_rate): assert samples.data.shape == (1, 16208 * sampling_rate // 16_000) -# @require_librosa @require_torchcodec @require_sndfile def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): @@ -268,7 +259,6 @@ def test_backwards_compatibility(shared_datadir): ) # can have off by one error -# @require_librosa @require_torchcodec @require_sndfile def test_dataset_with_audio_feature(shared_datadir): @@ -299,7 +289,6 @@ def test_dataset_with_audio_feature(shared_datadir): assert samples.data.shape == (1, 202311) -# @require_librosa @require_torchcodec @require_sndfile def 
test_dataset_with_audio_feature_tar_wav(tar_wav_path): @@ -335,7 +324,6 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): assert samples.data.shape == (1, 202311) -# @require_librosa @require_torchcodec @require_sndfile def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): @@ -406,7 +394,6 @@ def test_dataset_with_audio_feature_with_none(): assert item["nested"]["audio"] is None -# @require_librosa @require_torchcodec @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): @@ -437,7 +424,6 @@ def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): assert samples.data.shape == (1, 73401) -# @require_librosa @require_torchcodec @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): @@ -468,7 +454,6 @@ def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): assert samples.data.shape == (1, 40124) # (1, 40125) -# @require_librosa @require_torchcodec @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): @@ -503,7 +488,6 @@ def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): assert samples.data.shape == (1, 73401) -# @require_librosa @require_torchcodec @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir): @@ -565,7 +549,6 @@ def test_dataset_cast_to_audio_features(shared_datadir, build_data): assert isinstance(item["audio"], AudioDecoder) -# @require_librosa def test_dataset_concatenate_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -583,7 +566,6 @@ def test_dataset_concatenate_audio_features(shared_datadir): ) -# @require_librosa def test_dataset_concatenate_nested_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -627,7 +609,6 @@ def process_text(example): assert item == {"audio": expected_audio, "text": "Hello World!"} -# @require_librosa @require_sndfile @require_torchcodec def test_dataset_with_audio_feature_map_is_decoded(shared_datadir): @@ -659,7 +640,6 @@ def process_audio_sampling_rate_by_batch(batch): assert item["double_sampling_rate"] == 88200 -# @require_librosa @require_torchcodec @require_sndfile def test_formatted_dataset_with_audio_feature(shared_datadir): @@ -726,7 +706,6 @@ def jsonl_audio_dataset_path(shared_datadir, tmp_path_factory): return path -# @require_librosa @require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py index 3e7c1c61617..18169a352df 100644 --- a/tests/packaged_modules/test_audiofolder.py +++ b/tests/packaged_modules/test_audiofolder.py @@ -10,7 +10,7 @@ from datasets.download.streaming_download_manager import StreamingDownloadManager from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder, AudioFolderConfig -from ..utils import require_librosa, require_sndfile +from ..utils import require_sndfile, require_torchcodec @pytest.fixture @@ -149,7 +149,6 @@ def data_files_with_two_splits_and_metadata(request, tmp_path, audio_file): @pytest.fixture def data_files_with_zip_archives(tmp_path, audio_file): - import librosa import soundfile as sf data_dir = tmp_path / 
"audiofolder_data_dir_with_zip_archives" @@ -164,7 +163,7 @@ def data_files_with_zip_archives(tmp_path, audio_file): audio_filename2 = subdir / "audio_file2.wav" # in subdir # make sure they're two different audios # Indeed we won't be able to compare the audio filenames, since the archive is not extracted in streaming mode - array, sampling_rate = librosa.load(str(audio_filename), sr=16000) # original sampling rate is 44100 + array, sampling_rate = sf.read(str(audio_filename), sr=16000) # original sampling rate is 44100 sf.write(str(audio_filename2), array, samplerate=16000) audio_metadata_filename = archive_dir / "metadata.jsonl" @@ -199,7 +198,7 @@ def test_config_raises_when_invalid_data_files(data_files) -> None: _ = AudioFolderConfig(name="name", data_files=data_files) -@require_librosa +@require_torchcodec @require_sndfile # check that labels are inferred correctly from dir names def test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir): @@ -265,7 +264,7 @@ def test_generate_examples_drop_metadata(audio_file_with_metadata, drop_metadata assert example[column] is not None -@require_librosa +@require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_files_with_one_split_and_metadata): @@ -284,7 +283,7 @@ def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_fi assert all(example["text"] is not None for example in dataset) -@require_librosa +@require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data_files_with_two_splits_and_metadata): @@ -303,7 +302,7 @@ def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data assert all(example["text"] is not None for example in dataset) -@require_librosa +@require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives): diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py index 92cd763770a..12aa6275382 100644 --- a/tests/packaged_modules/test_webdataset.py +++ b/tests/packaged_modules/test_webdataset.py @@ -7,7 +7,6 @@ from datasets.packaged_modules.webdataset.webdataset import WebDataset from ..utils import ( - require_librosa, require_numpy1_on_windows, require_pil, require_sndfile, @@ -166,7 +165,6 @@ def test_image_webdataset_missing_keys(image_wds_file): @require_torchcodec -@require_librosa @require_sndfile def test_audio_webdataset(audio_wds_file): from torchcodec.decoders import AudioDecoder diff --git a/tests/test_formatting.py b/tests/test_formatting.py index a7194b4b667..9b4d9f235d4 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -20,13 +20,13 @@ from .utils import ( require_jax, - require_librosa, require_numpy1_on_windows, require_pil, require_polars, require_sndfile, require_tf, require_torch, + require_torchcodec, ) @@ -309,7 +309,7 @@ def test_numpy_formatter_image(self): self.assertEqual(batch["image"][0].dtype, np.uint8) self.assertEqual(batch["image"][0].shape, (480, 640, 3)) - @require_librosa + @require_torchcodec @require_sndfile def test_numpy_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) @@ -432,7 +432,7 @@ def test_torch_formatter_image(self): self.assertEqual(batch["image"][0].shape, (3, 
480, 640)) @require_torch - @require_librosa + @require_torchcodec @require_sndfile def test_torch_formatter_audio(self): import torch @@ -619,7 +619,7 @@ def test_jax_formatter_image(self): self.assertEqual(batch["image"][0].shape, (480, 640, 3)) @require_jax - @require_librosa + @require_torchcodec @require_sndfile def test_jax_formatter_audio(self): import jax.numpy as jnp diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index 6c834e22412..b118f174264 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -38,7 +38,7 @@ from datasets.utils.hub import hf_dataset_url from .fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN -from .utils import for_all_test_methods, require_librosa, require_pil, require_sndfile, xfail_if_500_502_http_error +from .utils import for_all_test_methods, require_pil, require_sndfile, require_torchcodec, xfail_if_500_502_http_error pytestmark = pytest.mark.integration @@ -387,7 +387,7 @@ def test_push_dataset_to_hub_custom_features(self, temporary_repo): assert ds.features == hub_ds.features assert ds[:] == hub_ds[:] - @require_librosa + @require_torchcodec @require_sndfile def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav") diff --git a/tests/utils.py b/tests/utils.py index 6aa08a75e2d..66341e70220 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -48,7 +48,6 @@ def parse_flag_from_env(key, default=False): require_zstandard = pytest.mark.skipif(not config.ZSTANDARD_AVAILABLE, reason="test requires zstandard") # Audio -require_librosa = pytest.mark.skipif(find_spec("librosa") is None, reason="test requires librosa") require_sndfile = pytest.mark.skipif( # On Windows and OS X, soundfile installs sndfile find_spec("soundfile") is None or version.parse(importlib.metadata.version("soundfile")) < version.parse("0.12.0"), From 3ee5f9024065f5339b36f57a2d83affd448619b5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 16:32:45 +0200 Subject: [PATCH 14/25] no torchcodec for windows --- setup.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index b1553e14797..6a587d4ce9e 100644 --- a/setup.py +++ b/setup.py @@ -184,14 +184,11 @@ "transformers>=4.42.0", # Pins numpy < 2 "zstandard", "polars[timezone]>=0.20.0", - "torchvision", - "av", + "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced + "soundfile>=0.12.1", + "torchcodec>=0.4.0; sys_platform == 'win32'", # not available for windows ] - -TESTS_REQUIRE.extend(VISION_REQUIRE) -TESTS_REQUIRE.extend(AUDIO_REQUIRE) - NUMPY2_INCOMPATIBLE_LIBRARIES = [ "faiss-cpu", "tensorflow", From eb6324c0f43c48b109fd0fea59ab9f8e55e395c1 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 16:35:35 +0200 Subject: [PATCH 15/25] further cleaning --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index 6a587d4ce9e..56adefc38b0 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,6 @@ "py7zr", "rarfile>=4.0", "sqlalchemy", - "s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1; test only on python 3.7 for now "protobuf<4.0.0", # 4.0.0 breaks compatibility with tensorflow<2.12 "tensorflow>=2.6.0; python_version<'3.10' and sys_platform != 'win32'", # numpy-2 is not supported for Python < 3.10 "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32'", # Pins numpy < 2 @@ -200,8 +199,6 @@ QUALITY_REQUIRE = ["ruff>=0.3.0"] DOCS_REQUIRE 
= [ - # Might need to add doc-builder and some specific deps in the future - "s3fs", # Following dependencies are required for the Python reference to be built properly "transformers", "torch", @@ -219,7 +216,6 @@ "tensorflow_gpu": ["tensorflow>=2.6.0"], "torch": ["torch"], "jax": ["jax>=0.3.14", "jaxlib>=0.3.14"], - "s3": ["s3fs"], "streaming": [], # for backward compatibility "dev": TESTS_REQUIRE + QUALITY_REQUIRE + DOCS_REQUIRE, "tests": TESTS_REQUIRE, From 8a1e0bc62d27526b7d156644f4a525ac3d463892 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 16:53:31 +0200 Subject: [PATCH 16/25] fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 56adefc38b0..b44b3e4deca 100644 --- a/setup.py +++ b/setup.py @@ -185,7 +185,7 @@ "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced "soundfile>=0.12.1", - "torchcodec>=0.4.0; sys_platform == 'win32'", # not available for windows + "torchcodec>=0.4.0; sys_platform != 'win32'", # not available for windows ] NUMPY2_INCOMPATIBLE_LIBRARIES = [ From 661b5741268caff39c889e21422875b2b3312088 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 17:17:48 +0200 Subject: [PATCH 17/25] install ffmpeg in ci --- .github/workflows/ci.yml | 9 +++-- tests/features/test_audio.py | 4 ++ tests/features/test_video.py | 1 + tests/packaged_modules/test_audiofolder.py | 5 ++- tests/test_load.py | 46 ---------------------- 5 files changed, 15 insertions(+), 50 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d299ceb4f2b..24b41285d06 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,15 +44,14 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Setup FFmpeg + uses: AnimMouse/setup-ffmpeg@v1 - name: Set up Python 3.9 uses: actions/setup-python@v5 with: python-version: "3.9" - name: Upgrade pip run: python -m pip install --upgrade pip - - name: Pin setuptools-scm - if: ${{ matrix.os == 'ubuntu-latest' }} - run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2" - name: Install uv run: pip install --upgrade uv - name: Install dependencies @@ -80,6 +79,8 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Setup FFmpeg + uses: AnimMouse/setup-ffmpeg@v1 - name: Set up Python 3.11 uses: actions/setup-python@v5 with: @@ -107,6 +108,8 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Setup FFmpeg + uses: AnimMouse/setup-ffmpeg@v1 - name: Set up Python 3.11 uses: actions/setup-python@v5 with: diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 17c8610124b..808ca976563 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -549,6 +549,8 @@ def test_dataset_cast_to_audio_features(shared_datadir, build_data): assert isinstance(item["audio"], AudioDecoder) +@require_torchcodec +@require_sndfile def test_dataset_concatenate_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -566,6 +568,8 @@ def test_dataset_concatenate_audio_features(shared_datadir): ) +@require_torchcodec +@require_sndfile def test_dataset_concatenate_nested_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / 
"test_audio_44100.wav") diff --git a/tests/features/test_video.py b/tests/features/test_video.py index 1acaf253bd4..64c1441227c 100644 --- a/tests/features/test_video.py +++ b/tests/features/test_video.py @@ -93,6 +93,7 @@ def test_dataset_with_video_map_and_formatted(shared_datadir): # Dataset casting and mapping +@require_torchcodec def test_dataset_with_video_feature_map_is_decoded(shared_datadir): video_path = str(shared_datadir / "test_video_66x50.mov") data = {"video": [video_path], "text": ["Hello"]} diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py index 18169a352df..b29439f8c73 100644 --- a/tests/packaged_modules/test_audiofolder.py +++ b/tests/packaged_modules/test_audiofolder.py @@ -150,6 +150,7 @@ def data_files_with_two_splits_and_metadata(request, tmp_path, audio_file): @pytest.fixture def data_files_with_zip_archives(tmp_path, audio_file): import soundfile as sf + from torchcodec.decoders import AudioDecoder data_dir = tmp_path / "audiofolder_data_dir_with_zip_archives" data_dir.mkdir(parents=True, exist_ok=True) @@ -163,7 +164,9 @@ def data_files_with_zip_archives(tmp_path, audio_file): audio_filename2 = subdir / "audio_file2.wav" # in subdir # make sure they're two different audios # Indeed we won't be able to compare the audio filenames, since the archive is not extracted in streaming mode - array, sampling_rate = sf.read(str(audio_filename), sr=16000) # original sampling rate is 44100 + audio = AudioDecoder(audio_filename, sample_rate=16000) # original sampling rate is 44100 + samples = audio.get_all_samples() + array = samples.data sf.write(str(audio_filename2), array, samplerate=16000) audio_metadata_filename = archive_dir / "metadata.jsonl" diff --git a/tests/test_load.py b/tests/test_load.py index c2f17b7b1b7..a532452eb4c 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -43,7 +43,6 @@ assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases, offline, - require_moto, require_pil, require_sndfile, set_current_working_directory_to_temp_dir, @@ -1213,51 +1212,6 @@ def test_load_dataset_local_with_default_in_memory(max_in_memory_dataset_size, d assert (dataset["train"].dataset_size < max_in_memory_dataset_size) is expected_in_memory -@pytest.fixture -def moto_server(monkeypatch): - from moto.server import ThreadedMotoServer - - monkeypatch.setattr( - "os.environ", - { - "AWS_ENDPOINT_URL": "http://localhost:5000", - "AWS_DEFAULT_REGION": "us-east-1", - "AWS_ACCESS_KEY_ID": "FOO", - "AWS_SECRET_ACCESS_KEY": "BAR", - }, - ) - server = ThreadedMotoServer() - server.start() - try: - yield - finally: - server.stop() - - -@require_moto -def test_load_file_from_s3(moto_server): - # we need server mode here because of an aiobotocore incompatibility with moto.mock_aws - # (https://github.com/getmoto/moto/issues/6836) - import boto3 - - # Create a mock S3 bucket - bucket_name = "test-bucket" - s3 = boto3.client("s3", region_name="us-east-1") - s3.create_bucket(Bucket=bucket_name) - - # Upload a file to the mock bucket - key = "test-file.csv" - csv_data = "Island\nIsabela\nBaltra" - - s3.put_object(Bucket=bucket_name, Key=key, Body=csv_data) - - # Load the file from the mock bucket - ds = datasets.load_dataset("csv", data_files={"train": "s3://test-bucket/test-file.csv"}) - - # Check if the loaded content matches the original content - assert list(ds["train"]) == [{"Island": "Isabela"}, {"Island": "Baltra"}] - - @pytest.mark.integration def test_remote_data_files(): repo_id = "hf-internal-testing/raw_jsonl" From 
803626587d8f0f2177260128d9cebeb1353936c3 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 17:50:10 +0200 Subject: [PATCH 18/25] fix ffmpeg installation --- .github/workflows/ci.yml | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 24b41285d06..47eb25c6866 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,7 +45,13 @@ jobs: with: fetch-depth: 0 - name: Setup FFmpeg - uses: AnimMouse/setup-ffmpeg@v1 + uses: FedericoCarboni/setup-ffmpeg@v3 + id: setup-ffmpeg + with: + ffmpeg-version: release + # As of version 3 of this action, builds are no longer downloaded from GitHub + # except on Windows: https://github.com/GyanD/codexffmpeg/releases. + github-token: ${{ github.server_url == 'https://github.com' && github.token || '' }} - name: Set up Python 3.9 uses: actions/setup-python@v5 with: @@ -80,7 +86,13 @@ jobs: with: fetch-depth: 0 - name: Setup FFmpeg - uses: AnimMouse/setup-ffmpeg@v1 + uses: FedericoCarboni/setup-ffmpeg@v3 + id: setup-ffmpeg + with: + ffmpeg-version: release + # As of version 3 of this action, builds are no longer downloaded from GitHub + # except on Windows: https://github.com/GyanD/codexffmpeg/releases. + github-token: ${{ github.server_url == 'https://github.com' && github.token || '' }} - name: Set up Python 3.11 uses: actions/setup-python@v5 with: @@ -109,7 +121,13 @@ jobs: with: fetch-depth: 0 - name: Setup FFmpeg - uses: AnimMouse/setup-ffmpeg@v1 + uses: FedericoCarboni/setup-ffmpeg@v3 + id: setup-ffmpeg + with: + ffmpeg-version: release + # As of version 3 of this action, builds are no longer downloaded from GitHub + # except on Windows: https://github.com/GyanD/codexffmpeg/releases. + github-token: ${{ github.server_url == 'https://github.com' && github.token || '' }} - name: Set up Python 3.11 uses: actions/setup-python@v5 with: From b582c5bf6b2d62fdad9cb762bbf9b2cb7ee41103 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 18:02:18 +0200 Subject: [PATCH 19/25] fix mono backward compatibility --- src/datasets/features/_torchcodec.py | 4 +++- src/datasets/features/audio.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datasets/features/_torchcodec.py b/src/datasets/features/_torchcodec.py index daf6b686f64..9a1b1fdf8e0 100644 --- a/src/datasets/features/_torchcodec.py +++ b/src/datasets/features/_torchcodec.py @@ -1,10 +1,12 @@ +import numpy as np from torchcodec.decoders import AudioDecoder as _AudioDecoder class AudioDecoder(_AudioDecoder): def __getitem__(self, key: str): if key == "array": - return self.get_all_samples().data.cpu().numpy() + y = self.get_all_samples().data.cpu().numpy() + return np.mean(y, axis=tuple(range(y.ndim - 1))) if getattr(self, "_mono", True) else y elif key == "sampling_rate": return self.get_samples_played_in_range(0, 0).sample_rate elif hasattr(super(), "__getitem__"): diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 9fb2120abd2..a2d5e700e03 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -204,6 +204,7 @@ def decode_example( audio = AudioDecoder( bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels ) + audio._mono = self.mono audio._hf_encoded = {"path": path, "bytes": bytes} audio.metadata.path = path return audio From 4e265db7cc0cc0be5f2e2b880b4365e8aba11bf0 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 18:09:05 +0200 
Subject: [PATCH 20/25] fix ffmpeg --- .github/workflows/ci.yml | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 47eb25c6866..21d6f78def6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,13 +45,8 @@ jobs: with: fetch-depth: 0 - name: Setup FFmpeg - uses: FedericoCarboni/setup-ffmpeg@v3 - id: setup-ffmpeg - with: - ffmpeg-version: release - # As of version 3 of this action, builds are no longer downloaded from GitHub - # except on Windows: https://github.com/GyanD/codexffmpeg/releases. - github-token: ${{ github.server_url == 'https://github.com' && github.token || '' }} + if: ${{ matrix.os == 'ubuntu-latest' }} + run: sudo apt install -y ffmpeg - name: Set up Python 3.9 uses: actions/setup-python@v5 with: @@ -86,13 +81,8 @@ jobs: with: fetch-depth: 0 - name: Setup FFmpeg - uses: FedericoCarboni/setup-ffmpeg@v3 - id: setup-ffmpeg - with: - ffmpeg-version: release - # As of version 3 of this action, builds are no longer downloaded from GitHub - # except on Windows: https://github.com/GyanD/codexffmpeg/releases. - github-token: ${{ github.server_url == 'https://github.com' && github.token || '' }} + if: ${{ matrix.os == 'ubuntu-latest' }} + run: sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: @@ -121,13 +111,8 @@ jobs: with: fetch-depth: 0 - name: Setup FFmpeg - uses: FedericoCarboni/setup-ffmpeg@v3 - id: setup-ffmpeg - with: - ffmpeg-version: release - # As of version 3 of this action, builds are no longer downloaded from GitHub - # except on Windows: https://github.com/GyanD/codexffmpeg/releases. - github-token: ${{ github.server_url == 'https://github.com' && github.token || '' }} + if: ${{ matrix.os == 'ubuntu-latest' }} + run: sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: From f043c0cb95a44d3c4c245d2fa7507d5903d51c44 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 18:22:56 +0200 Subject: [PATCH 21/25] again --- .github/workflows/ci.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 21d6f78def6..a4a9041fd28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,7 +46,9 @@ jobs: fetch-depth: 0 - name: Setup FFmpeg if: ${{ matrix.os == 'ubuntu-latest' }} - run: sudo apt install -y ffmpeg + run: | + sudo apt update + sudo apt install -y ffmpeg - name: Set up Python 3.9 uses: actions/setup-python@v5 with: @@ -82,7 +84,9 @@ jobs: fetch-depth: 0 - name: Setup FFmpeg if: ${{ matrix.os == 'ubuntu-latest' }} - run: sudo apt install -y ffmpeg + run: | + sudo apt update + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: @@ -112,7 +116,9 @@ jobs: fetch-depth: 0 - name: Setup FFmpeg if: ${{ matrix.os == 'ubuntu-latest' }} - run: sudo apt install -y ffmpeg + run: | + sudo apt update + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: From 37763db199a19129cffd08dc3d86c017bc92cb1d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 19 Jun 2025 18:37:21 +0200 Subject: [PATCH 22/25] fix mono backward compat --- src/datasets/features/_torchcodec.py | 2 +- src/datasets/features/audio.py | 15 +++------------ tests/features/test_audio.py | 12 ++++-------- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/src/datasets/features/_torchcodec.py 
b/src/datasets/features/_torchcodec.py index 9a1b1fdf8e0..d71b4156a51 100644 --- a/src/datasets/features/_torchcodec.py +++ b/src/datasets/features/_torchcodec.py @@ -6,7 +6,7 @@ class AudioDecoder(_AudioDecoder): def __getitem__(self, key: str): if key == "array": y = self.get_all_samples().data.cpu().numpy() - return np.mean(y, axis=tuple(range(y.ndim - 1))) if getattr(self, "_mono", True) else y + return np.mean(y, axis=tuple(range(y.ndim - 1))) if y.ndim > 1 else y elif key == "sampling_rate": return self.get_samples_played_in_range(0, 0).sample_rate elif hasattr(super(), "__getitem__"): diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index a2d5e700e03..3810c44dbe9 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -75,7 +75,6 @@ class Audio: """ sampling_rate: Optional[int] = None - mono: bool = True decode: bool = True stream_index: Optional[int] = None id: Optional[str] = field(default=None, repr=False) @@ -179,11 +178,8 @@ def decode_example( if path is None and bytes is None: raise ValueError(f"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.") - channels = 1 if self.mono else None if bytes is None and is_local_path(path): - audio = AudioDecoder( - path, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels - ) + audio = AudioDecoder(path, stream_index=self.stream_index, sample_rate=self.sampling_rate) elif bytes is None: token_per_repo_id = token_per_repo_id or {} @@ -196,15 +192,10 @@ def decode_example( download_config = DownloadConfig(token=token) f = xopen(path, "rb", download_config=download_config) - audio = AudioDecoder( - f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels - ) + audio = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate) else: - audio = AudioDecoder( - bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=channels - ) - audio._mono = self.mono + audio = AudioDecoder(bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate) audio._hf_encoded = {"path": path, "bytes": bytes} audio.metadata.path = path return audio diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 808ca976563..4296d2543a4 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -244,19 +244,15 @@ def test_backwards_compatibility(shared_datadir): assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert decoded_example["sampling_rate"] == samples.sample_rate - assert ( - decoded_example["array"].shape[0] == samples.data.shape[0] - and abs(decoded_example["array"].shape[1] - samples.data.shape[1]) < 2 - ) # can have off by one error + assert decoded_example["array"].ndim == 1 # mono + assert abs(decoded_example["array"].shape[0] - samples.data.shape[1]) < 2 # can have off by one error decoded_example = audio.decode_example(audio.encode_example(audio_path2)) assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert decoded_example["sampling_rate"] == samples.sample_rate - assert ( - decoded_example["array"].shape[0] == samples.data.shape[0] - and abs(decoded_example["array"].shape[1] - samples.data.shape[1]) < 2 - ) # can have off by one error + assert decoded_example["array"].ndim == 1 # mono + assert abs(decoded_example["array"].shape[0] - samples.data.shape[1]) < 2 # can have off by one error @require_torchcodec From 
5198748763643ecd31f58ca38d72f3da554ec784 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 19 Jun 2025 18:47:36 +0200
Subject: [PATCH 23/25] fix tests

---
 src/datasets/features/audio.py | 6 +++---
 tests/packaged_modules/test_audiofolder.py | 4 ++--
 tmp.wav | Bin 0 -> 64 bytes
 3 files changed, 5 insertions(+), 5 deletions(-)
 create mode 100644 tmp.wav

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 3810c44dbe9..d982e329326 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -119,7 +119,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder
 elif "array" in value:
 # convert the audio array to wav bytes
 buffer = BytesIO()
- sf.write(buffer, value["array"], value["sampling_rate"], format="wav")
+ sf.write(buffer, value["array"].T, value["sampling_rate"], format="wav")
 return {"bytes": buffer.getvalue(), "path": None}
 elif value.get("path") is not None and os.path.isfile(value["path"]):
 # we set "bytes": None to not duplicate the data if they're already available locally
@@ -300,7 +300,7 @@ def encode_torchcodec_audio(audio: "AudioDecoder") -> dict:
 raise ImportError("To support encoding audio data, please install 'soundfile'.") from err

 samples = audio.get_all_samples()
- array = samples.data.cpu().numpy().T
+ array = samples.data.cpu().numpy()
 buffer = BytesIO()
- sf.write(buffer, array, samples.sample_rate, format="wav")
+ sf.write(buffer, array.T, samples.sample_rate, format="wav")
 return {"bytes": buffer.getvalue(), "path": None}
diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py
index b29439f8c73..bf05fa12f6d 100644
--- a/tests/packaged_modules/test_audiofolder.py
+++ b/tests/packaged_modules/test_audiofolder.py
@@ -166,8 +166,8 @@ def data_files_with_zip_archives(tmp_path, audio_file):
 # Indeed we won't be able to compare the audio filenames, since the archive is not extracted in streaming mode
 audio = AudioDecoder(audio_filename, sample_rate=16000) # original sampling rate is 44100
 samples = audio.get_all_samples()
- array = samples.data
- sf.write(str(audio_filename2), array, samplerate=16000)
+ array = samples.data.cpu().numpy()
+ sf.write(str(audio_filename2), array.T, samplerate=16000)

 audio_metadata_filename = archive_dir / "metadata.jsonl"
 audio_metadata = textwrap.dedent(
diff --git a/tmp.wav b/tmp.wav
new file mode 100644
index 0000000000000000000000000000000000000000..4cd0bb4b07eb080c6d91e536e6dbe74b39e8b35a
GIT binary patch
literal 64
vcmWIYbaS&{U|

Date: Thu, 19 Jun 2025 19:08:16 +0200
Subject: [PATCH 24/25] fix tests

---
 src/datasets/packaged_modules/json/README.md | 30 --------
 tests/features/test_audio.py | 72 ++++++++++----------
 2 files changed, 36 insertions(+), 66 deletions(-)
 delete mode 100644 src/datasets/packaged_modules/json/README.md

diff --git a/src/datasets/packaged_modules/json/README.md b/src/datasets/packaged_modules/json/README.md
deleted file mode 100644
index a07cb902a4f..00000000000
--- a/src/datasets/packaged_modules/json/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
----
-dataset_info:
-  features:
-  - name: tokens
-    list: string
-  - name: ner_tags
-    list:
-      class_label:
-        names:
-          '0': O
-          '1': B-PER
-          '2': I-PER
-          '3': B-ORG
-          '4': I-ORG
-          '5': B-LOC
-          '6': I-LOC
-  - name: langs
-    list: string
-  - name: spans
-    list: string
-  splits:
-  - name: train
-    num_bytes: 2351563
-    num_examples: 10000
-  - name: validation
-    num_bytes: 238418
-    num_examples: 1000
-  download_size: 3940680
-  dataset_size: 2589981
----
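
# A standalone check (not from the patch) of the layout convention behind the
# `.T` fixes above: torchcodec's AudioSamples.data is (num_channels, num_samples),
# while soundfile.write expects (num_samples, num_channels), hence the transpose
# before writing WAV bytes.
from io import BytesIO

import numpy as np
import soundfile as sf

sr = 16_000
stereo = np.stack([np.zeros(sr), np.ones(sr)]).astype(np.float32)  # (2, sr), torchcodec layout

buf = BytesIO()
sf.write(buf, stereo.T, sr, format="wav")  # transpose to (sr, 2) for soundfile

buf.seek(0)
data, rate = sf.read(buf)
assert data.shape == (sr, 2) and rate == sr  # round-trips with channels last
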
diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 4296d2543a4..38999e64b4e 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -2,6 +2,7 @@ import tarfile from itertools import product +import numpy as np import pyarrow as pa import pytest @@ -40,7 +41,6 @@ def iter_archive(archive_path): def test_audio_instantiation(): audio = Audio() assert audio.sampling_rate is None - assert audio.mono is True assert audio.id is None assert audio.stream_index is None @@ -70,7 +70,7 @@ def test_audio_feature_type_to_arrow(): lambda audio_path: {"path": audio_path, "bytes": open(audio_path, "rb").read()}, lambda audio_path: {"path": None, "bytes": open(audio_path, "rb").read()}, lambda audio_path: {"bytes": open(audio_path, "rb").read()}, - lambda audio_path: {"array": [0.1, 0.2, 0.3], "sampling_rate": 16_000}, + lambda audio_path: {"array": np.array([0.1, 0.2, 0.3]), "sampling_rate": 16_000}, ], ) def test_audio_feature_encode_example(shared_datadir, build_example): @@ -94,7 +94,7 @@ def test_audio_feature_encode_example(shared_datadir, build_example): lambda audio_path: {"path": audio_path, "sampling_rate": 16_000}, lambda audio_path: {"path": audio_path, "bytes": None, "sampling_rate": 16_000}, lambda audio_path: {"path": audio_path, "bytes": open(audio_path, "rb").read(), "sampling_rate": 16_000}, - lambda audio_path: {"array": [0.1, 0.2, 0.3], "sampling_rate": 16_000}, + lambda audio_path: {"array": np.array([0.1, 0.2, 0.3]), "sampling_rate": 16_000}, ], ) def test_audio_feature_encode_example_pcm(shared_datadir, build_example): @@ -144,7 +144,7 @@ def test_audio_decode_example(shared_datadir): assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) with pytest.raises(RuntimeError): Audio(decode=False).decode_example(audio_path) @@ -161,7 +161,7 @@ def test_audio_resampling(shared_datadir): assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 73401) + assert samples.data.shape == (2, 73401) @require_torchcodec @@ -176,7 +176,7 @@ def test_audio_decode_example_mp3(shared_datadir): assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 110592) + assert samples.data.shape == (2, 110592) @require_torchcodec @@ -222,13 +222,13 @@ def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 48000 - assert samples.data.shape == (1, 120373) + assert samples.data.shape == (2, 120373) decoded_example = audio.decode_example(audio.encode_example(audio_path2)) assert isinstance(decoded_example, AudioDecoder) samples = decoded_example.get_all_samples() assert samples.sample_rate == 48000 - assert samples.data.shape == (1, 122688) + assert samples.data.shape == (2, 122688) @require_torchcodec @@ -269,20 +269,20 @@ def test_dataset_with_audio_feature(shared_datadir): assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 
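# The (1, N) -> (2, N) shape updates below and throughout this file follow from
# PATCH 22 removing the `mono` downmix: the stereo test fixture now decodes with
# its native two channels instead of being averaged to one.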
assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) @require_torchcodec @@ -302,7 +302,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) assert item["audio"].metadata.path == audio_filename batch = dset[:1] assert batch.keys() == {"audio"} @@ -310,14 +310,14 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) assert batch["audio"][0].metadata.path == audio_filename column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) @require_torchcodec @@ -337,7 +337,7 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 110592) + assert samples.data.shape == (2, 110592) assert item["audio"].metadata.path == audio_filename batch = dset[:1] assert batch.keys() == {"audio"} @@ -345,14 +345,14 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 110592) + assert samples.data.shape == (2, 110592) assert batch["audio"][0].metadata.path == audio_filename column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 110592) + assert samples.data.shape == (2, 110592) @require_torchcodec @@ -404,20 +404,20 @@ def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 73401) + assert samples.data.shape == (2, 73401) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 73401) + assert samples.data.shape == (2, 73401) column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 73401) + assert samples.data.shape == (2, 73401) @require_torchcodec @@ -434,20 +434,20 @@ def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): assert isinstance(item["audio"], AudioDecoder) 
samples = item["audio"].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (2, 40124) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (2, 40124) column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (2, 40124) @require_torchcodec @@ -468,20 +468,20 @@ def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 73401) + assert samples.data.shape == (2, 73401) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 73401) + assert samples.data.shape == (2, 73401) column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 73401) + assert samples.data.shape == (2, 73401) @require_torchcodec @@ -502,20 +502,20 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (2, 40124) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (2, 40124) column = dset["audio"] assert len(column) == 1 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 16000 - assert samples.data.shape == (1, 40124) # (1, 40125) + assert samples.data.shape == (2, 40124) @require_torchcodec @@ -655,20 +655,20 @@ def test_formatted_dataset_with_audio_feature(shared_datadir): assert isinstance(item["audio"], AudioDecoder) samples = item["audio"].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 assert isinstance(batch["audio"][0], AudioDecoder) samples = batch["audio"][0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) column = dset["audio"] assert len(column) == 2 assert isinstance(column[0], AudioDecoder) samples = column[0].get_all_samples() assert samples.sample_rate == 44100 - assert samples.data.shape == (1, 202311) + assert samples.data.shape == (2, 202311) with dset.formatted_as("pandas"): item = dset[0] @@ -677,20 +677,20 @@ def test_formatted_dataset_with_audio_feature(shared_datadir): assert 
        samples = item["audio"][0].get_all_samples()
         assert samples.sample_rate == 44100
-        assert samples.data.shape == (1, 202311)
+        assert samples.data.shape == (2, 202311)
         batch = dset[:1]
         assert batch.shape == (1, 1)
         assert batch.columns == ["audio"]
         assert isinstance(batch["audio"][0], AudioDecoder)
         samples = batch["audio"][0].get_all_samples()
         assert samples.sample_rate == 44100
-        assert samples.data.shape == (1, 202311)
+        assert samples.data.shape == (2, 202311)
         column = dset["audio"]
         assert len(column) == 2
         assert isinstance(column[0], AudioDecoder)
         samples = column[0].get_all_samples()
         assert samples.sample_rate == 44100
-        assert samples.data.shape == (1, 202311)
+        assert samples.data.shape == (2, 202311)
 
 
 @pytest.fixture
@@ -721,7 +721,7 @@ def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, sh
     assert isinstance(item["audio"], AudioDecoder)
     samples = item["audio"].get_all_samples()
     assert samples.sample_rate == 44100
-    assert samples.data.shape == (1, 202311)
+    assert samples.data.shape == (2, 202311)
     assert item["audio"].metadata.path == audio_path

From 4a637bd963557bca42d3a2d8c54200fccb7ab91e Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 19 Jun 2025 19:57:23 +0200
Subject: [PATCH 25/25] again

---
 tests/fixtures/files.py                    | 10 ++++++++++
 tests/packaged_modules/test_audiofolder.py | 22 +++++++--------------
 tmp.wav                                    | Bin 64 -> 0 bytes
 3 files changed, 17 insertions(+), 15 deletions(-)
 delete mode 100644 tmp.wav

diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py
index fe9549c30b4..25b1448ae46 100644
--- a/tests/fixtures/files.py
+++ b/tests/fixtures/files.py
@@ -551,6 +551,16 @@ def audio_file():
     return os.path.join("tests", "features", "data", "test_audio_44100.wav")
 
 
+@pytest.fixture(scope="session")
+def audio_file_44100():
+    return os.path.join("tests", "features", "data", "test_audio_44100.mp3")
+
+
+@pytest.fixture(scope="session")
+def audio_file_16000():
+    return os.path.join("tests", "features", "data", "test_audio_16000.mp3")
+
+
 @pytest.fixture(scope="session")
 def tensor_file(tmp_path_factory):
     import torch
diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py
index bf05fa12f6d..1dc87c1f6f0 100644
--- a/tests/packaged_modules/test_audiofolder.py
+++ b/tests/packaged_modules/test_audiofolder.py
@@ -148,10 +148,7 @@ def data_files_with_two_splits_and_metadata(request, tmp_path, audio_file):
 
 
 @pytest.fixture
-def data_files_with_zip_archives(tmp_path, audio_file):
-    import soundfile as sf
-    from torchcodec.decoders import AudioDecoder
-
+def data_files_with_zip_archives(tmp_path, audio_file_44100, audio_file_16000):
     data_dir = tmp_path / "audiofolder_data_dir_with_zip_archives"
     data_dir.mkdir(parents=True, exist_ok=True)
     archive_dir = data_dir / "archive"
@@ -159,21 +156,16 @@ def data_files_with_zip_archives(tmp_path, audio_file):
     subdir = archive_dir / "subdir"
     subdir.mkdir(parents=True, exist_ok=True)
 
-    audio_filename = archive_dir / "audio_file.wav"
-    shutil.copyfile(audio_file, audio_filename)
-    audio_filename2 = subdir / "audio_file2.wav"  # in subdir
-    # make sure they're two different audios
-    # Indeed we won't be able to compare the audio filenames, since the archive is not extracted in streaming mode
-    audio = AudioDecoder(audio_filename, sample_rate=16000)  # original sampling rate is 44100
-    samples = audio.get_all_samples()
-    array = samples.data.cpu().numpy()
-    sf.write(str(audio_filename2), array.T, samplerate=16000)
+    audio_filename = archive_dir / "audio_file.mp3"
+    shutil.copyfile(audio_file_44100, audio_filename)
+    audio_filename2 = subdir / "audio_file2.mp3"  # in subdir
+    shutil.copyfile(audio_file_16000, audio_filename2)
 
     audio_metadata_filename = archive_dir / "metadata.jsonl"
     audio_metadata = textwrap.dedent(
         """\
-        {"file_name": "audio_file.wav", "text": "First audio transcription"}
-        {"file_name": "subdir/audio_file2.wav", "text": "Second audio transcription (in subdir)"}
+        {"file_name": "audio_file.mp3", "text": "First audio transcription"}
+        {"file_name": "subdir/audio_file2.mp3", "text": "Second audio transcription (in subdir)"}
         """
     )
 
diff --git a/tmp.wav b/tmp.wav
deleted file mode 100644
index 4cd0bb4b07eb080c6d91e536e6dbe74b39e8b35a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 64
vcmWIYbaS&{U|
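
The shape assertions above all move from (1, N) to (2, N) because torchcodec's AudioDecoder preserves the channel layout of the source file: get_all_samples() returns a samples object whose .data tensor is laid out as (num_channels, num_samples). A minimal sketch of the behavior the updated tests rely on, assuming the repo's test_audio_44100.wav fixture is a stereo 44.1 kHz file (an inference from the new expected shapes, not something stated in the patch):

    # Sketch only -- not part of the patch. Assumes a stereo 44.1 kHz fixture.
    from torchcodec.decoders import AudioDecoder

    # Decode at the native rate: .data has shape (num_channels, num_samples),
    # so the stereo fixture yields (2, 202311) rather than a mono (1, 202311).
    decoder = AudioDecoder("tests/features/data/test_audio_44100.wav")
    samples = decoder.get_all_samples()
    print(samples.sample_rate, tuple(samples.data.shape))  # 44100 (2, 202311)

    # Resample at decode time, as the resampling tests do through
    # Audio(sampling_rate=16000): the channel count is preserved and only the
    # sample axis shrinks, e.g. (2, 202311) -> (2, 73401).
    resampled = AudioDecoder("tests/features/data/test_audio_44100.wav", sample_rate=16000)
    print(tuple(resampled.get_all_samples().data.shape))  # (2, 73401)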
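
The reworked data_files_with_zip_archives fixture drops the soundfile round-trip and instead ships two pre-encoded mp3 fixtures at different sampling rates, so the archive still contains two distinguishable audios without decoding anything at fixture-setup time. A quick hedged check of that property (the .metadata.sample_rate attribute is assumed from torchcodec's stream metadata, matching the .metadata.path accesses used in the tests):

    # Sketch only -- verifies the two fixtures differ in sampling rate, which
    # is what the old fixture achieved by resampling a copy with soundfile.
    from torchcodec.decoders import AudioDecoder

    assert AudioDecoder("tests/features/data/test_audio_44100.mp3").metadata.sample_rate == 44100
    assert AudioDecoder("tests/features/data/test_audio_16000.mp3").metadata.sample_rate == 16000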