Skip to content

Commit 2833a08

Browse files
committed
Refactor unit tests and test non-positive size
1 parent dad8fbf commit 2833a08

1 file changed

Lines changed: 122 additions & 83 deletions

File tree

tests/utils/test_manifest.py

Lines changed: 122 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
# under the License.
1717
# pylint: disable=redefined-outer-name,arguments-renamed,fixme
1818
import importlib
19+
from pathlib import Path
1920
from tempfile import TemporaryDirectory
21+
from typing import Any
2022

2123
import fastavro
2224
import pytest
@@ -978,52 +980,56 @@ def test_inherit_from_manifest_snapshot_id() -> None:
978980
assert result.snapshot_id == 3051729675574597004
979981
assert result.sequence_number == 1
980982
assert result.file_sequence_number == 1
983+
def _create_test_manifest_list(module: Any, io: PyArrowFileIO, tmp_dir: str, name: str, snapshot_id: int) -> str:
984+
schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
985+
spec = UNPARTITIONED_PARTITION_SPEC
986+
987+
manifest_path = f"{tmp_dir}/manifest-{name}.avro"
988+
with module.write_manifest(
989+
format_version=2,
990+
spec=spec,
991+
schema=schema,
992+
output_file=io.new_output(manifest_path),
993+
snapshot_id=snapshot_id,
994+
avro_compression="zstandard",
995+
) as writer:
996+
data_file = module.DataFile.from_args(
997+
content=module.DataFileContent.DATA,
998+
file_path=f"{tmp_dir}/data-{name}.parquet",
999+
file_format=module.FileFormat.PARQUET,
1000+
partition=Record(),
1001+
record_count=100,
1002+
file_size_in_bytes=1000,
1003+
)
1004+
writer.add_entry(
1005+
module.ManifestEntry.from_args(
1006+
status=module.ManifestEntryStatus.ADDED,
1007+
snapshot_id=snapshot_id,
1008+
data_file=data_file,
1009+
)
1010+
)
1011+
manifest_file = writer.to_manifest_file()
1012+
1013+
list_path = f"{tmp_dir}/manifest-list-{name}.avro"
1014+
with module.write_manifest_list(
1015+
format_version=2,
1016+
output_file=io.new_output(list_path),
1017+
snapshot_id=snapshot_id,
1018+
parent_snapshot_id=snapshot_id - 1 if snapshot_id > 1 else None,
1019+
sequence_number=snapshot_id,
1020+
avro_compression="zstandard",
1021+
) as list_writer:
1022+
list_writer.add_manifests([manifest_file])
1023+
1024+
return list_path
1025+
1026+
9811027
def test_clear_manifest_cache() -> None:
9821028
"""Test that clear_manifest_cache() clears cache entries while keeping cache enabled."""
9831029
io = PyArrowFileIO()
9841030

9851031
with TemporaryDirectory() as tmp_dir:
986-
schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
987-
spec = UNPARTITIONED_PARTITION_SPEC
988-
989-
# Create a manifest file
990-
manifest_path = f"{tmp_dir}/manifest.avro"
991-
with write_manifest(
992-
format_version=2,
993-
spec=spec,
994-
schema=schema,
995-
output_file=io.new_output(manifest_path),
996-
snapshot_id=1,
997-
avro_compression="zstandard",
998-
) as writer:
999-
data_file = DataFile.from_args(
1000-
content=DataFileContent.DATA,
1001-
file_path=f"{tmp_dir}/data.parquet",
1002-
file_format=FileFormat.PARQUET,
1003-
partition=Record(),
1004-
record_count=100,
1005-
file_size_in_bytes=1000,
1006-
)
1007-
writer.add_entry(
1008-
ManifestEntry.from_args(
1009-
status=ManifestEntryStatus.ADDED,
1010-
snapshot_id=1,
1011-
data_file=data_file,
1012-
)
1013-
)
1014-
manifest_file = writer.to_manifest_file()
1015-
1016-
# Create a manifest list
1017-
list_path = f"{tmp_dir}/manifest-list.avro"
1018-
with write_manifest_list(
1019-
format_version=2,
1020-
output_file=io.new_output(list_path),
1021-
snapshot_id=1,
1022-
parent_snapshot_id=None,
1023-
sequence_number=1,
1024-
avro_compression="zstandard",
1025-
) as list_writer:
1026-
list_writer.add_manifests([manifest_file])
1032+
list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="clear", snapshot_id=1)
10271033

10281034
# Populate the cache
10291035
_manifests(io, list_path)
@@ -1042,9 +1048,10 @@ def test_clear_manifest_cache() -> None:
10421048
assert len(cache_after) == 0, "Cache should be empty after clear"
10431049

10441050

1045-
def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.MonkeyPatch) -> None:
1046-
"""Test that setting manifest-cache-size to 0 disables caching."""
1047-
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "0")
1051+
@pytest.mark.parametrize("cache_size", ["0", "-1"])
1052+
def test_manifest_cache_can_be_disabled_with_non_positive_size(monkeypatch: pytest.MonkeyPatch, cache_size: str) -> None:
1053+
"""Test that non-positive manifest-cache-size values disable caching."""
1054+
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", cache_size)
10481055
importlib.reload(manifest_module)
10491056

10501057
try:
@@ -1054,45 +1061,7 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
10541061
io = PyArrowFileIO()
10551062

10561063
with TemporaryDirectory() as tmp_dir:
1057-
schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
1058-
spec = UNPARTITIONED_PARTITION_SPEC
1059-
1060-
manifest_path = f"{tmp_dir}/manifest.avro"
1061-
with manifest_module.write_manifest(
1062-
format_version=2,
1063-
spec=spec,
1064-
schema=schema,
1065-
output_file=io.new_output(manifest_path),
1066-
snapshot_id=1,
1067-
avro_compression="zstandard",
1068-
) as writer:
1069-
data_file = manifest_module.DataFile.from_args(
1070-
content=manifest_module.DataFileContent.DATA,
1071-
file_path=f"{tmp_dir}/data.parquet",
1072-
file_format=manifest_module.FileFormat.PARQUET,
1073-
partition=Record(),
1074-
record_count=100,
1075-
file_size_in_bytes=1000,
1076-
)
1077-
writer.add_entry(
1078-
manifest_module.ManifestEntry.from_args(
1079-
status=manifest_module.ManifestEntryStatus.ADDED,
1080-
snapshot_id=1,
1081-
data_file=data_file,
1082-
)
1083-
)
1084-
manifest_file = writer.to_manifest_file()
1085-
1086-
list_path = f"{tmp_dir}/manifest-list.avro"
1087-
with manifest_module.write_manifest_list(
1088-
format_version=2,
1089-
output_file=io.new_output(list_path),
1090-
snapshot_id=1,
1091-
parent_snapshot_id=None,
1092-
sequence_number=1,
1093-
avro_compression="zstandard",
1094-
) as list_writer:
1095-
list_writer.add_manifests([manifest_file])
1064+
list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="disabled", snapshot_id=1)
10961065

10971066
manifests_first_call = manifest_module._manifests(io, list_path)
10981067
manifests_second_call = manifest_module._manifests(io, list_path)
@@ -1102,3 +1071,73 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
11021071
finally:
11031072
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
11041073
importlib.reload(manifest_module)
1074+
1075+
1076+
def test_manifest_cache_respects_positive_env_size(monkeypatch: pytest.MonkeyPatch) -> None:
1077+
"""Test that a positive manifest-cache-size enables a bounded cache."""
1078+
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "1")
1079+
importlib.reload(manifest_module)
1080+
1081+
try:
1082+
assert manifest_module._manifest_cache_size == 1
1083+
1084+
io = PyArrowFileIO()
1085+
1086+
with TemporaryDirectory() as tmp_dir:
1087+
first_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="first", snapshot_id=1)
1088+
second_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="second", snapshot_id=2)
1089+
1090+
manifests_first_call = manifest_module._manifests(io, first_list_path)
1091+
manifests_second_call = manifest_module._manifests(io, first_list_path)
1092+
1093+
assert manifests_first_call[0] is manifests_second_call[0]
1094+
assert len(manifest_module._manifest_cache) == 1
1095+
1096+
manifest_module._manifests(io, second_list_path)
1097+
1098+
assert len(manifest_module._manifest_cache) == 1
1099+
finally:
1100+
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
1101+
importlib.reload(manifest_module)
1102+
1103+
1104+
def test_manifest_cache_reads_size_from_configuration_file(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
1105+
"""Test that manifest-cache-size can be loaded from .pyiceberg.yaml."""
1106+
config_dir = tmp_path / "config"
1107+
config_dir.mkdir()
1108+
(config_dir / ".pyiceberg.yaml").write_text("manifest-cache-size: 2\n", encoding="utf-8")
1109+
1110+
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
1111+
monkeypatch.setenv("PYICEBERG_HOME", str(config_dir))
1112+
importlib.reload(manifest_module)
1113+
1114+
try:
1115+
assert manifest_module._manifest_cache_size == 2
1116+
1117+
io = PyArrowFileIO()
1118+
1119+
with TemporaryDirectory() as tmp_dir:
1120+
first_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="first", snapshot_id=1)
1121+
second_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="second", snapshot_id=2)
1122+
third_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="third", snapshot_id=3)
1123+
1124+
manifest_module._manifests(io, first_list_path)
1125+
manifest_module._manifests(io, second_list_path)
1126+
manifest_module._manifests(io, third_list_path)
1127+
1128+
assert len(manifest_module._manifest_cache) == 2
1129+
finally:
1130+
monkeypatch.delenv("PYICEBERG_HOME", raising=False)
1131+
importlib.reload(manifest_module)
1132+
1133+
1134+
def test_invalid_manifest_cache_size_raises_value_error(monkeypatch: pytest.MonkeyPatch) -> None:
1135+
"""Test that invalid manifest-cache-size values raise a helpful error."""
1136+
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "not-an-int")
1137+
1138+
try:
1139+
with pytest.raises(ValueError, match="manifest-cache-size should be an integer or left unset"):
1140+
importlib.reload(manifest_module)
1141+
finally:
1142+
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
1143+
importlib.reload(manifest_module)

0 commit comments

Comments
 (0)