1616# under the License.
1717# pylint: disable=redefined-outer-name,arguments-renamed,fixme
1818import importlib
19+ from pathlib import Path
1920from tempfile import TemporaryDirectory
21+ from typing import Any
2022
2123import fastavro
2224import pytest
@@ -978,52 +980,56 @@ def test_inherit_from_manifest_snapshot_id() -> None:
978980 assert result .snapshot_id == 3051729675574597004
979981 assert result .sequence_number == 1
980982 assert result .file_sequence_number == 1
983+ def _create_test_manifest_list (module : Any , io : PyArrowFileIO , tmp_dir : str , name : str , snapshot_id : int ) -> str :
984+ schema = Schema (NestedField (field_id = 1 , name = "id" , field_type = IntegerType (), required = True ))
985+ spec = UNPARTITIONED_PARTITION_SPEC
986+
987+ manifest_path = f"{ tmp_dir } /manifest-{ name } .avro"
988+ with module .write_manifest (
989+ format_version = 2 ,
990+ spec = spec ,
991+ schema = schema ,
992+ output_file = io .new_output (manifest_path ),
993+ snapshot_id = snapshot_id ,
994+ avro_compression = "zstandard" ,
995+ ) as writer :
996+ data_file = module .DataFile .from_args (
997+ content = module .DataFileContent .DATA ,
998+ file_path = f"{ tmp_dir } /data-{ name } .parquet" ,
999+ file_format = module .FileFormat .PARQUET ,
1000+ partition = Record (),
1001+ record_count = 100 ,
1002+ file_size_in_bytes = 1000 ,
1003+ )
1004+ writer .add_entry (
1005+ module .ManifestEntry .from_args (
1006+ status = module .ManifestEntryStatus .ADDED ,
1007+ snapshot_id = snapshot_id ,
1008+ data_file = data_file ,
1009+ )
1010+ )
1011+ manifest_file = writer .to_manifest_file ()
1012+
1013+ list_path = f"{ tmp_dir } /manifest-list-{ name } .avro"
1014+ with module .write_manifest_list (
1015+ format_version = 2 ,
1016+ output_file = io .new_output (list_path ),
1017+ snapshot_id = snapshot_id ,
1018+ parent_snapshot_id = snapshot_id - 1 if snapshot_id > 1 else None ,
1019+ sequence_number = snapshot_id ,
1020+ avro_compression = "zstandard" ,
1021+ ) as list_writer :
1022+ list_writer .add_manifests ([manifest_file ])
1023+
1024+ return list_path
1025+
1026+
9811027def test_clear_manifest_cache () -> None :
9821028 """Test that clear_manifest_cache() clears cache entries while keeping cache enabled."""
9831029 io = PyArrowFileIO ()
9841030
9851031 with TemporaryDirectory () as tmp_dir :
986- schema = Schema (NestedField (field_id = 1 , name = "id" , field_type = IntegerType (), required = True ))
987- spec = UNPARTITIONED_PARTITION_SPEC
988-
989- # Create a manifest file
990- manifest_path = f"{ tmp_dir } /manifest.avro"
991- with write_manifest (
992- format_version = 2 ,
993- spec = spec ,
994- schema = schema ,
995- output_file = io .new_output (manifest_path ),
996- snapshot_id = 1 ,
997- avro_compression = "zstandard" ,
998- ) as writer :
999- data_file = DataFile .from_args (
1000- content = DataFileContent .DATA ,
1001- file_path = f"{ tmp_dir } /data.parquet" ,
1002- file_format = FileFormat .PARQUET ,
1003- partition = Record (),
1004- record_count = 100 ,
1005- file_size_in_bytes = 1000 ,
1006- )
1007- writer .add_entry (
1008- ManifestEntry .from_args (
1009- status = ManifestEntryStatus .ADDED ,
1010- snapshot_id = 1 ,
1011- data_file = data_file ,
1012- )
1013- )
1014- manifest_file = writer .to_manifest_file ()
1015-
1016- # Create a manifest list
1017- list_path = f"{ tmp_dir } /manifest-list.avro"
1018- with write_manifest_list (
1019- format_version = 2 ,
1020- output_file = io .new_output (list_path ),
1021- snapshot_id = 1 ,
1022- parent_snapshot_id = None ,
1023- sequence_number = 1 ,
1024- avro_compression = "zstandard" ,
1025- ) as list_writer :
1026- list_writer .add_manifests ([manifest_file ])
1032+ list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "clear" , snapshot_id = 1 )
10271033
10281034 # Populate the cache
10291035 _manifests (io , list_path )
@@ -1042,9 +1048,10 @@ def test_clear_manifest_cache() -> None:
10421048 assert len (cache_after ) == 0 , "Cache should be empty after clear"
10431049
10441050
1045- def test_manifest_cache_can_be_disabled_with_zero_size (monkeypatch : pytest .MonkeyPatch ) -> None :
1046- """Test that setting manifest-cache-size to 0 disables caching."""
1047- monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , "0" )
1051+ @pytest .mark .parametrize ("cache_size" , ["0" , "-1" ])
1052+ def test_manifest_cache_can_be_disabled_with_non_positive_size (monkeypatch : pytest .MonkeyPatch , cache_size : str ) -> None :
1053+ """Test that non-positive manifest-cache-size values disable caching."""
1054+ monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , cache_size )
10481055 importlib .reload (manifest_module )
10491056
10501057 try :
@@ -1054,45 +1061,7 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
10541061 io = PyArrowFileIO ()
10551062
10561063 with TemporaryDirectory () as tmp_dir :
1057- schema = Schema (NestedField (field_id = 1 , name = "id" , field_type = IntegerType (), required = True ))
1058- spec = UNPARTITIONED_PARTITION_SPEC
1059-
1060- manifest_path = f"{ tmp_dir } /manifest.avro"
1061- with manifest_module .write_manifest (
1062- format_version = 2 ,
1063- spec = spec ,
1064- schema = schema ,
1065- output_file = io .new_output (manifest_path ),
1066- snapshot_id = 1 ,
1067- avro_compression = "zstandard" ,
1068- ) as writer :
1069- data_file = manifest_module .DataFile .from_args (
1070- content = manifest_module .DataFileContent .DATA ,
1071- file_path = f"{ tmp_dir } /data.parquet" ,
1072- file_format = manifest_module .FileFormat .PARQUET ,
1073- partition = Record (),
1074- record_count = 100 ,
1075- file_size_in_bytes = 1000 ,
1076- )
1077- writer .add_entry (
1078- manifest_module .ManifestEntry .from_args (
1079- status = manifest_module .ManifestEntryStatus .ADDED ,
1080- snapshot_id = 1 ,
1081- data_file = data_file ,
1082- )
1083- )
1084- manifest_file = writer .to_manifest_file ()
1085-
1086- list_path = f"{ tmp_dir } /manifest-list.avro"
1087- with manifest_module .write_manifest_list (
1088- format_version = 2 ,
1089- output_file = io .new_output (list_path ),
1090- snapshot_id = 1 ,
1091- parent_snapshot_id = None ,
1092- sequence_number = 1 ,
1093- avro_compression = "zstandard" ,
1094- ) as list_writer :
1095- list_writer .add_manifests ([manifest_file ])
1064+ list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "disabled" , snapshot_id = 1 )
10961065
10971066 manifests_first_call = manifest_module ._manifests (io , list_path )
10981067 manifests_second_call = manifest_module ._manifests (io , list_path )
@@ -1102,3 +1071,73 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
11021071 finally :
11031072 monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
11041073 importlib .reload (manifest_module )
1074+
1075+
1076+ def test_manifest_cache_respects_positive_env_size (monkeypatch : pytest .MonkeyPatch ) -> None :
1077+ """Test that a positive manifest-cache-size enables a bounded cache."""
1078+ monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , "1" )
1079+ importlib .reload (manifest_module )
1080+
1081+ try :
1082+ assert manifest_module ._manifest_cache_size == 1
1083+
1084+ io = PyArrowFileIO ()
1085+
1086+ with TemporaryDirectory () as tmp_dir :
1087+ first_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "first" , snapshot_id = 1 )
1088+ second_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "second" , snapshot_id = 2 )
1089+
1090+ manifests_first_call = manifest_module ._manifests (io , first_list_path )
1091+ manifests_second_call = manifest_module ._manifests (io , first_list_path )
1092+
1093+ assert manifests_first_call [0 ] is manifests_second_call [0 ]
1094+ assert len (manifest_module ._manifest_cache ) == 1
1095+
1096+ manifest_module ._manifests (io , second_list_path )
1097+
1098+ assert len (manifest_module ._manifest_cache ) == 1
1099+ finally :
1100+ monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
1101+ importlib .reload (manifest_module )
1102+
1103+
1104+ def test_manifest_cache_reads_size_from_configuration_file (monkeypatch : pytest .MonkeyPatch , tmp_path : Path ) -> None :
1105+ """Test that manifest-cache-size can be loaded from .pyiceberg.yaml."""
1106+ config_dir = tmp_path / "config"
1107+ config_dir .mkdir ()
1108+ (config_dir / ".pyiceberg.yaml" ).write_text ("manifest-cache-size: 2\n " , encoding = "utf-8" )
1109+
1110+ monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
1111+ monkeypatch .setenv ("PYICEBERG_HOME" , str (config_dir ))
1112+ importlib .reload (manifest_module )
1113+
1114+ try :
1115+ assert manifest_module ._manifest_cache_size == 2
1116+
1117+ io = PyArrowFileIO ()
1118+
1119+ with TemporaryDirectory () as tmp_dir :
1120+ first_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "first" , snapshot_id = 1 )
1121+ second_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "second" , snapshot_id = 2 )
1122+ third_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "third" , snapshot_id = 3 )
1123+
1124+ manifest_module ._manifests (io , first_list_path )
1125+ manifest_module ._manifests (io , second_list_path )
1126+ manifest_module ._manifests (io , third_list_path )
1127+
1128+ assert len (manifest_module ._manifest_cache ) == 2
1129+ finally :
1130+ monkeypatch .delenv ("PYICEBERG_HOME" , raising = False )
1131+ importlib .reload (manifest_module )
1132+
1133+
1134+ def test_invalid_manifest_cache_size_raises_value_error (monkeypatch : pytest .MonkeyPatch ) -> None :
1135+ """Test that invalid manifest-cache-size values raise a helpful error."""
1136+ monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , "not-an-int" )
1137+
1138+ try :
1139+ with pytest .raises (ValueError , match = "manifest-cache-size should be an integer or left unset" ):
1140+ importlib .reload (manifest_module )
1141+ finally :
1142+ monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
1143+ importlib .reload (manifest_module )
0 commit comments