@@ -90,39 +90,40 @@ def test_overwrite_removes_only_selected_datafile(prepopulated_table: Table, dup
9090
9191 removed_files : List [DataFile ] = mt .deduplicate_data_files ()
9292
93- file_paths_after : Set [str ] = {df .file_path for df in mt ._get_all_datafiles ()}
94-
95- # Both files should remain, since they are not duplicates
96- assert str (dupe_data_file_path ) in file_paths_after , "Expected file_a.parquet to remain in the table"
97- assert len (removed_files ) == 0 , "Expected no files to be removed since there are no duplicates"
93+ file_names_after : Set [str ] = {df .file_path .split ("/" )[- 1 ] for df in mt ._get_all_datafiles ()}
94+ # Only one file with the same name should remain after deduplication
95+ assert dupe_data_file_path .name in file_names_after , f"Expected { dupe_data_file_path .name } to remain in the table"
96+ assert len (file_names_after ) == 1 , "Expected only one unique file name to remain after deduplication"
97+ # All removed files should have the same file name
98+ assert all (df .file_path .split ("/" )[- 1 ] == dupe_data_file_path .name for df in removed_files ), "All removed files should be duplicates by name"
9899
99100
100101def test_get_all_datafiles_current_snapshot (prepopulated_table : Table , dupe_data_file_path : Path ) -> None :
101102 mt = MaintenanceTable (tbl = prepopulated_table )
102103
103104 datafiles : List [DataFile ] = mt ._get_all_datafiles ()
104- file_paths : Set [str ] = {df .file_path for df in datafiles }
105- assert str ( dupe_data_file_path ) in file_paths
105+ file_paths : Set [str ] = {df .file_path . split ( "/" )[ - 1 ] for df in datafiles }
106+ assert dupe_data_file_path . name in file_paths
106107
107108
108109def test_get_all_datafiles_all_snapshots (prepopulated_table : Table , dupe_data_file_path : Path ) -> None :
109110 mt = MaintenanceTable (tbl = prepopulated_table )
110111
111112 datafiles : List [DataFile ] = mt ._get_all_datafiles ()
112- file_paths : Set [str ] = {df .file_path for df in datafiles }
113- assert str ( dupe_data_file_path ) in file_paths
113+ file_paths : Set [str ] = {df .file_path . split ( "/" )[ - 1 ] for df in datafiles }
114+ assert dupe_data_file_path . name in file_paths
114115
115116
116117def test_dedup_data_files_removes_duplicates_in_current_snapshot (prepopulated_table : Table , dupe_data_file_path : Path ) -> None :
117118 mt = MaintenanceTable (tbl = prepopulated_table )
118119
119120 all_datafiles : List [DataFile ] = mt ._get_all_datafiles ()
120- file_paths : List [str ] = [df .file_path for df in all_datafiles ]
121+ file_paths : List [str ] = [df .file_path . split ( "/" )[ - 1 ] for df in all_datafiles ]
121122 # Only one reference should remain after deduplication
122- assert file_paths .count (str ( dupe_data_file_path ) ) == 1
123+ assert file_paths .count (dupe_data_file_path . name ) == 1
123124 removed : List [DataFile ] = mt .deduplicate_data_files ()
124125
125126 all_datafiles_after : List [DataFile ] = mt ._get_all_datafiles ()
126- file_paths_after : List [str ] = [df .file_path for df in all_datafiles_after ]
127- assert file_paths_after .count (str ( dupe_data_file_path ) ) == 1
127+ file_paths_after : List [str ] = [df .file_path . split ( "/" )[ - 1 ] for df in all_datafiles_after ]
128+ assert file_paths_after .count (dupe_data_file_path . name ) == 1
128129 assert all (isinstance (df , DataFile ) for df in removed )
0 commit comments