diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 59451a640e6..06698347f02 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1847,6 +1847,7 @@ def save_to_disk( # if we have only a few large samples, we should only create as many shards as samples num_shards = min(len(self.data), num_shards) + dataset_path = str(dataset_path) fs: fsspec.AbstractFileSystem fs, _ = url_to_fs(dataset_path, **(storage_options or {})) @@ -2018,6 +2019,7 @@ def load_from_disk( >>> ds = load_from_disk("path/to/dataset/directory") ``` """ + dataset_path = str(dataset_path) fs: fsspec.AbstractFileSystem fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {})) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index acf25a06307..27cfaeb41d9 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -329,6 +329,13 @@ def test_dummy_dataset_serialize(self, in_memory): self.assertEqual(dset[0]["filename"], "my_name-train_0") self.assertEqual(dset["filename"][0], "my_name-train_0") + with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset: + dataset_path = Path(tmp_dir) / "my_dataset_pathlib" + dset.save_to_disk(dataset_path) + + with Dataset.load_from_disk(dataset_path) as dset: + self.assertEqual(len(dset), 10) + with self._create_dummy_dataset(in_memory, tmp_dir).select( range(10), indices_cache_file_name=os.path.join(tmp_dir, "ind.arrow") ) as dset: