From 1cacf9890778affa98a2ad608abd8396d909d30c Mon Sep 17 00:00:00 2001
From: Daniil Ivanik <61067749+divanik@users.noreply.github.com>
Date: Tue, 7 Apr 2026 14:53:33 +0000
Subject: [PATCH 1/2] Merge pull request #100645 from
 ClickHouse/divanik/add_bytes_and_rows_count_to_iceberg_data_object

Parse record_count and size_bytes fields from iceberg manifest file
---
 src/Core/ProtocolDefines.h                         |   3 +-
 .../Common/AvroForIcebergDeserializer.cpp          |  15 +-
 .../DataLakes/Iceberg/Constant.h                   |   2 +
 .../Iceberg/IcebergDataObjectInfo.cpp              |  52 +++++-
 .../DataLakes/Iceberg/IcebergDataObjectInfo.h      |   2 +
 .../DataLakes/Iceberg/ManifestFile.h               |  10 +-
 .../StorageObjectStorageSource.cpp                 |  20 +++
 .../gtest_datalake_table_state_serde.cpp           |  96 ++++++++++
 .../test_file_stats_logging.py                     | 159 ++++++++++++++++++
 9 files changed, 353 insertions(+), 6 deletions(-)
 create mode 100644 tests/integration/test_storage_iceberg_with_spark/test_file_stats_logging.py

diff --git a/src/Core/ProtocolDefines.h b/src/Core/ProtocolDefines.h
index 45e6463a4225..ae7cff5d0e38 100644
--- a/src/Core/ProtocolDefines.h
+++ b/src/Core/ProtocolDefines.h
@@ -38,7 +38,8 @@ static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_ME
 static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_ICEBERG_METADATA = 3;
 static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_FILE_BUCKETS_INFO = 4;
 static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_EXCLUDED_ROWS = 5;
-static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION = DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_EXCLUDED_ROWS;
+static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_ICEBERG_FILE_STATS = 6;
+static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION = DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_ICEBERG_FILE_STATS;
 
 static constexpr auto DATA_LAKE_TABLE_STATE_SNAPSHOT_PROTOCOL_VERSION = 1;
 
diff --git a/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp b/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp
index c27bb2ac7117..fb295c895d50 100644
--- a/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp
+++ b/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp
@@ -236,6 +236,9 @@ ParsedManifestFileEntryPtr AvroForIcebergDeserializer::createParsedManifestFileE
         sort_order_id = sort_order_id_value.safeGet<Int32>();
     }
 
+    const auto record_count = getValueFromRowByName(row_index, c_data_file_record_count, TypeIndex::Int64).safeGet<Int64>();
+    const auto file_size_in_bytes = getValueFromRowByName(row_index, c_data_file_file_size_in_bytes, TypeIndex::Int64).safeGet<Int64>();
+
     switch (content_type)
     {
         case FileContentType::DATA: {
@@ -253,7 +256,9 @@ ParsedManifestFileEntryPtr AvroForIcebergDeserializer::createParsedManifestFileE
                 /*lower_reference_data_file_path_ = */ std::nullopt,
                 /*upper_reference_data_file_path_ = */ std::nullopt,
                 /*equality_ids*/ std::nullopt,
-                sort_order_id);
+                sort_order_id,
+                record_count,
+                file_size_in_bytes);
         }
         case FileContentType::POSITION_DELETE: {
             /// reference_file_path can be absent in schema for some reason, though it is present in specification: https://iceberg.apache.org/spec/#manifests
@@ -296,7 +301,9 @@ ParsedManifestFileEntryPtr AvroForIcebergDeserializer::createParsedManifestFileE
                 lower_reference_data_file_path,
                 upper_reference_data_file_path,
                 /*equality_ids*/ std::nullopt,
-                /*sort_order_id = */ std::nullopt);
+                /*sort_order_id = */ std::nullopt,
+                record_count,
+                file_size_in_bytes);
         }
         case FileContentType::EQUALITY_DELETE: {
             std::vector<Int32> equality_ids;
@@ -325,7 +332,9 @@ ParsedManifestFileEntryPtr AvroForIcebergDeserializer::createParsedManifestFileE
                 /*lower_reference_data_file_path_ = */ std::nullopt,
                 /*upper_reference_data_file_path_ = */ std::nullopt,
                 equality_ids,
-                /*sort_order_id = */ std::nullopt);
+                /*sort_order_id = */ std::nullopt,
+                record_count,
+                file_size_in_bytes);
         }
     }
 }
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Constant.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Constant.h
index 6f1e455e351a..23f205736a5d 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Constant.h
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Constant.h
@@ -149,6 +149,8 @@ DEFINE_ICEBERG_FIELD_COMPOUND(data_file, lower_bounds);
 DEFINE_ICEBERG_FIELD_COMPOUND(data_file, upper_bounds);
 DEFINE_ICEBERG_FIELD_COMPOUND(data_file, referenced_data_file);
 DEFINE_ICEBERG_FIELD_COMPOUND(data_file, sort_order_id);
+DEFINE_ICEBERG_FIELD_COMPOUND(data_file, record_count);
+DEFINE_ICEBERG_FIELD_COMPOUND(data_file, file_size_in_bytes);
 
 /// Fallback defaults for snapshot retention policy when table properties are absent.
 /// These values follow the Java reference implementation; the Iceberg spec does not
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp
index 9a963feb3d3f..7046483523d5 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp
@@ -43,7 +43,9 @@ IcebergDataObjectInfo::IcebergDataObjectInfo(
         data_manifest_file_entry_->sequence_number,
         data_manifest_file_entry_->parsed_entry->file_format,
         /* position_deletes_objects */ {},
-        /* equality_deletes_objects */ {}}
+        /* equality_deletes_objects */ {},
+        data_manifest_file_entry_->parsed_entry->record_count,
+        data_manifest_file_entry_->parsed_entry->file_size_in_bytes}
 {
 }
 
@@ -137,6 +139,27 @@ void IcebergObjectSerializableInfo::serializeForClusterFunctionProtocol(WriteBuf
             }
         }
     }
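+    /// Each optional stat is framed as a varUInt presence flag (0 or 1),
+    /// followed by the varInt-encoded value when the flag is set.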
+    if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_ICEBERG_FILE_STATS)
+    {
+        if (record_count.has_value())
+        {
+            writeVarUInt(1, out);
+            writeVarInt(*record_count, out);
+        }
+        else
+        {
+            writeVarUInt(0, out);
+        }
+        if (file_size_in_bytes.has_value())
+        {
+            writeVarUInt(1, out);
+            writeVarInt(*file_size_in_bytes, out);
+        }
+        else
+        {
+            writeVarUInt(0, out);
+        }
+    }
 }
 
 void IcebergObjectSerializableInfo::deserializeForClusterFunctionProtocol(ReadBuffer & in, size_t protocol_version)
@@ -193,6 +216,33 @@ void IcebergObjectSerializableInfo::deserializeForClusterFunctionProtocol(ReadBu
             }
         }
     }
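+    /// Mirror of the serializer above: read the presence flag first, then the value only if it is set.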
+    if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_ICEBERG_FILE_STATS)
+    {
+        size_t has_record_count = 0;
+        readVarUInt(has_record_count, in);
+        if (has_record_count)
+        {
+            Int64 value = 0;
+            readVarInt(value, in);
+            record_count = value;
+        }
+        else
+        {
+            record_count = std::nullopt;
+        }
+        size_t has_file_size = 0;
+        readVarUInt(has_file_size, in);
+        if (has_file_size)
+        {
+            Int64 value = 0;
+            readVarInt(value, in);
+            file_size_in_bytes = value;
+        }
+        else
+        {
+            file_size_in_bytes = std::nullopt;
+        }
+    }
 }
 
 void IcebergObjectSerializableInfo::checkVersion(size_t protocol_version) const
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h
index ce8e551d6e32..75c1bb9fba3d 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h
@@ -22,6 +22,8 @@ struct IcebergObjectSerializableInfo
     String file_format;
     std::vector position_deletes_objects;
    std::vector equality_deletes_objects;
+    std::optional<Int64> record_count;
+    std::optional<Int64> file_size_in_bytes;
 
     void serializeForClusterFunctionProtocol(WriteBuffer & out, size_t protocol_version) const;
     void deserializeForClusterFunctionProtocol(ReadBuffer & in, size_t protocol_version);
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h
index 3a1bccc53411..3a1b100a5e66 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h
@@ -96,6 +96,10 @@ struct ParsedManifestFileEntry : boost::noncopyable
     /// Data file is sorted with this sort_order_id (can be read from metadata.json)
     std::optional<Int32> sort_order_id;
 
+    /// File-level statistics from the Iceberg manifest (required fields per the spec)
+    Int64 record_count;
+    Int64 file_size_in_bytes;
+
     ParsedManifestFileEntry(
         FileContentType content_type_,
         String file_path_key_,
@@ -110,7 +114,9 @@ struct ParsedManifestFileEntry : boost::noncopyable
         std::optional<String> lower_reference_data_file_path_,
         std::optional<String> upper_reference_data_file_path_,
         std::optional<std::vector<Int32>> equality_ids_,
-        std::optional<Int32> sort_order_id_)
+        std::optional<Int32> sort_order_id_,
+        Int64 record_count_,
+        Int64 file_size_in_bytes_)
         : content_type(content_type_)
         , file_path_key(std::move(file_path_key_))
         , row_number(row_number_)
@@ -125,6 +131,8 @@ struct ParsedManifestFileEntry : boost::noncopyable
         , upper_reference_data_file_path(std::move(upper_reference_data_file_path_))
         , equality_ids(std::move(equality_ids_))
         , sort_order_id(sort_order_id_)
+        , record_count(record_count_)
+        , file_size_in_bytes(file_size_in_bytes_)
     {
     }
 };
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
index a6567a237634..374795064495 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -97,6 +98,23 @@ namespace ErrorCodes
     extern const int FILE_DOESNT_EXIST;
 }
 
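+/// Log per-file Iceberg statistics at TEST level so integration tests can grep them from the server log.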
+void logIcebergFileStats(const ObjectInfoPtr & object_info, const LoggerPtr & log)
+{
+#if USE_AVRO
+    if (auto iceberg_object = std::dynamic_pointer_cast<IcebergDataObjectInfo>(object_info))
+    {
+        const auto & info = iceberg_object->info;
+        if (info.record_count.has_value())
+            LOG_TEST(log, "Iceberg record_count for '{}': {}", object_info->getPath(), *info.record_count);
+        if (info.file_size_in_bytes.has_value())
+            LOG_TEST(log, "Iceberg file_size_in_bytes for '{}': {}", object_info->getPath(), *info.file_size_in_bytes);
+    }
+#else
+    UNUSED(object_info);
+    UNUSED(log);
+#endif
+}
+
 StorageObjectStorageSource::StorageObjectStorageSource(
     String name_,
     ObjectStoragePtr object_storage_,
@@ -878,6 +896,8 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
         object_info->getObjectMetadata()->size_bytes,
         object_info->getFileFormat().value_or(configuration->getFormat()));
 
+    logIcebergFileStats(object_info, log);
+
     bool use_native_reader_v3 = format_settings.has_value()
         ? format_settings->parquet.use_native_reader_v3
         : context_->getSettingsRef()[Setting::input_format_parquet_use_native_reader_v3];
diff --git a/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp b/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp
index 1b5e3aa2c233..cf5b8befdaf2 100644
--- a/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp
+++ b/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp
@@ -2,7 +2,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 
 using namespace DB;
 
@@ -60,3 +62,97 @@ TEST(DatalakeStateSerde, DataLakeStateSerde)
     }
 }
 
+
+TEST(DatalakeStateSerde, IcebergObjectSerializableInfoRoundTrip)
+{
+    Iceberg::IcebergObjectSerializableInfo info;
+    info.data_object_file_path_key = DB::Iceberg::IcebergPathFromMetadata::deserialize("s3://bucket/path/to/file.parquet");
+    info.underlying_format_read_schema_id = 42;
+    info.schema_id_relevant_to_iterator = 7;
+    info.sequence_number = 123456;
+    info.file_format = "PARQUET";
+    info.position_deletes_objects = {{"s3://bucket/deletes/pos1.parquet", "PARQUET", "s3://bucket/path/to/file.parquet"}};
+    info.equality_deletes_objects = {{"s3://bucket/deletes/eq1.parquet", "PARQUET", std::vector<Int32>{1, 2, 3}, 42}};
+    info.record_count = 100500;
+    info.file_size_in_bytes = 999888777;
+
+    String str;
+    {
+        WriteBufferFromString write_buffer{str};
+        info.serializeForClusterFunctionProtocol(write_buffer, DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION);
+    }
+    ReadBufferFromMemory read_buffer(str.data(), str.size());
+    Iceberg::IcebergObjectSerializableInfo deserialized;
+    deserialized.deserializeForClusterFunctionProtocol(read_buffer, DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION);
+
+    ASSERT_TRUE(read_buffer.eof());
+    ASSERT_EQ(deserialized.data_object_file_path_key, info.data_object_file_path_key);
+    ASSERT_EQ(deserialized.underlying_format_read_schema_id, info.underlying_format_read_schema_id);
+    ASSERT_EQ(deserialized.schema_id_relevant_to_iterator, info.schema_id_relevant_to_iterator);
+    ASSERT_EQ(deserialized.sequence_number, info.sequence_number);
+    ASSERT_EQ(deserialized.file_format, info.file_format);
+    ASSERT_EQ(deserialized.position_deletes_objects.size(), 1);
+    ASSERT_EQ(deserialized.equality_deletes_objects.size(), 1);
+    ASSERT_TRUE(deserialized.record_count.has_value());
+    ASSERT_EQ(*deserialized.record_count, 100500);
+    ASSERT_TRUE(deserialized.file_size_in_bytes.has_value());
+    ASSERT_EQ(*deserialized.file_size_in_bytes, 999888777);
+}
+
+
+TEST(DatalakeStateSerde, IcebergObjectSerializableInfoWithoutFileStats)
+{
+    Iceberg::IcebergObjectSerializableInfo info;
+    info.data_object_file_path_key = DB::Iceberg::IcebergPathFromMetadata::deserialize("s3://bucket/path/to/file.parquet");
+    info.underlying_format_read_schema_id = 1;
+    info.schema_id_relevant_to_iterator = 1;
+    info.sequence_number = 0;
+    info.file_format = "PARQUET";
+    info.record_count = 42;
+    info.file_size_in_bytes = 1024;
+
+    /// Serialize at protocol version 5 (before file stats were added)
+    String str;
+    {
+        WriteBufferFromString write_buffer{str};
+        info.serializeForClusterFunctionProtocol(write_buffer, DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_EXCLUDED_ROWS);
+    }
+
+    /// Deserialize at protocol version 5 — the new fields should be absent
+    ReadBufferFromMemory read_buffer(str.data(), str.size());
+    Iceberg::IcebergObjectSerializableInfo deserialized;
+    deserialized.deserializeForClusterFunctionProtocol(read_buffer, DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_EXCLUDED_ROWS);
+
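+    /// The v5 stream must be fully consumed: nothing extra may have been written for the new fields.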
+    ASSERT_TRUE(read_buffer.eof());
+    ASSERT_EQ(deserialized.data_object_file_path_key, info.data_object_file_path_key);
+    ASSERT_FALSE(deserialized.record_count.has_value());
+    ASSERT_FALSE(deserialized.file_size_in_bytes.has_value());
+}
+
+
+TEST(DatalakeStateSerde, IcebergObjectSerializableInfoNulloptFileStats)
+{
+    /// Test round-trip when the fields are explicitly nullopt
+    Iceberg::IcebergObjectSerializableInfo info;
+    info.data_object_file_path_key = DB::Iceberg::IcebergPathFromMetadata::deserialize("s3://bucket/path/to/file.parquet");
+    info.underlying_format_read_schema_id = 1;
+    info.schema_id_relevant_to_iterator = 1;
+    info.sequence_number = 0;
+    info.file_format = "PARQUET";
+    info.record_count = std::nullopt;
+    info.file_size_in_bytes = std::nullopt;
+
+    String str;
+    {
+        WriteBufferFromString write_buffer{str};
+        info.serializeForClusterFunctionProtocol(write_buffer, DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION);
+    }
+    ReadBufferFromMemory read_buffer(str.data(), str.size());
+    Iceberg::IcebergObjectSerializableInfo deserialized;
+    deserialized.deserializeForClusterFunctionProtocol(read_buffer, DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION);
+
+    ASSERT_TRUE(read_buffer.eof());
+    ASSERT_FALSE(deserialized.record_count.has_value());
+    ASSERT_FALSE(deserialized.file_size_in_bytes.has_value());
+}
diff --git a/tests/integration/test_storage_iceberg_with_spark/test_file_stats_logging.py b/tests/integration/test_storage_iceberg_with_spark/test_file_stats_logging.py
new file mode 100644
index 000000000000..742560fc4716
--- /dev/null
+++ b/tests/integration/test_storage_iceberg_with_spark/test_file_stats_logging.py
@@ -0,0 +1,159 @@
+import logging
+import pytest
+import re
+
+from helpers.iceberg_utils import (
+    default_upload_directory,
+    write_iceberg_from_df,
+    generate_data,
+    get_creation_expression,
+    get_uuid_str,
+)
+
+from helpers.config_cluster import minio_secret_key
+
+
+def parse_logged_file_stats(instance, query_ids):
+    """
+    Parse Iceberg file stats from the server log, filtering by any of the given query_ids.
+    Returns a dict: {file_path: {"record_count": int, "file_size_in_bytes": int}}.
+    """
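+    # A matching log line looks like (the path value here is illustrative):
+    #   Iceberg record_count for 'data/00000-0-file.parquet': 100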
+ """ + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + bucket = started_cluster_iceberg_with_spark.minio_bucket + + cluster_suffix = "_cluster" if run_on_cluster else "" + TABLE_NAME = ( + "test_file_stats_logging_" + + format_version + + "_" + + storage_type + + cluster_suffix + + "_" + + get_uuid_str() + ) + + NUM_ROWS = 100 + + write_iceberg_from_df( + spark, + generate_data(spark, 0, NUM_ROWS), + TABLE_NAME, + mode="overwrite", + format_version=format_version, + ) + + default_upload_directory( + started_cluster_iceberg_with_spark, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + table_function_expr = get_creation_expression( + storage_type, + TABLE_NAME, + started_cluster_iceberg_with_spark, + table_function=True, + run_on_cluster=run_on_cluster, + ) + + query_id = TABLE_NAME + "_query_id" + result = instance.query( + f"SELECT * FROM {table_function_expr}", query_id=query_id + ) + assert len(result.strip().split("\n")) == NUM_ROWS + + # Collect all query_ids related to this query across all nodes + # (in cluster mode, worker nodes get their own query_id but share initial_query_id) + all_query_ids = {query_id} + for node_name, node_instance in started_cluster_iceberg_with_spark.instances.items(): + node_instance.query("SYSTEM FLUSH LOGS") + related_ids = node_instance.query( + f"SELECT query_id FROM system.query_log WHERE initial_query_id = '{query_id}'" + ).strip() + for qid in related_ids.splitlines(): + if qid: + all_query_ids.add(qid) + logging.info(f"All related query_ids: {all_query_ids}") + + # Collect file stats from all nodes, filtered by any related query_id + all_stats = {} + for node_name, node_instance in started_cluster_iceberg_with_spark.instances.items(): + node_stats = parse_logged_file_stats(node_instance, all_query_ids) + logging.info(f"[{node_name}] Parsed file stats from logs: {node_stats}") + all_stats.update(node_stats) + + assert len(all_stats) > 0, "Expected at least one file with logged stats" + + for file_path, file_stats in all_stats.items(): + assert "record_count" in file_stats, f"Missing record_count for {file_path}" + assert ( + "file_size_in_bytes" in file_stats + ), f"Missing file_size_in_bytes for {file_path}" + + logged_record_count = file_stats["record_count"] + logged_file_size = file_stats["file_size_in_bytes"] + + # Verify record_count matches actual row count by reading the parquet file directly + actual_row_count = int( + instance.query( + f"SELECT count() FROM s3('http://minio1:9001/{bucket}/{file_path}', 'minio', '{minio_secret_key}', 'Parquet')" + ).strip() + ) + assert logged_record_count == actual_row_count, ( + f"record_count mismatch for {file_path}: " + f"logged={logged_record_count}, actual={actual_row_count}" + ) + + # Verify file_size_in_bytes matches actual object size in S3 + actual_size = started_cluster_iceberg_with_spark.minio_client.stat_object( + bucket, file_path + ).size + assert logged_file_size == actual_size, ( + f"file_size_in_bytes mismatch for {file_path}: " + f"logged={logged_file_size}, actual={actual_size}" + ) + + # Verify total record_count sums to at least NUM_ROWS + total_record_count = sum(s.get("record_count", 0) for s in all_stats.values()) + assert total_record_count >= NUM_ROWS, ( + f"Total record_count ({total_record_count}) < expected ({NUM_ROWS})" + ) From 3cb64e52ee67212a5319b9d529e7172317c75434 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Sat, 9 May 2026 
From 3cb64e52ee67212a5319b9d529e7172317c75434 Mon Sep 17 00:00:00 2001
From: Mikhail Koviazin
Date: Sat, 9 May 2026 14:51:31 +0200
Subject: [PATCH 2/2] fix gtest to use string instead of IcebergPath

---
 .../tests/gtest_datalake_table_state_serde.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp b/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp
index cf5b8befdaf2..7b962a491af1 100644
--- a/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp
+++ b/src/Storages/ObjectStorage/tests/gtest_datalake_table_state_serde.cpp
@@ -66,7 +66,7 @@ TEST(DatalakeStateSerde, DataLakeStateSerde)
 TEST(DatalakeStateSerde, IcebergObjectSerializableInfoRoundTrip)
 {
     Iceberg::IcebergObjectSerializableInfo info;
-    info.data_object_file_path_key = DB::Iceberg::IcebergPathFromMetadata::deserialize("s3://bucket/path/to/file.parquet");
+    info.data_object_file_path_key = "s3://bucket/path/to/file.parquet";
     info.underlying_format_read_schema_id = 42;
     info.schema_id_relevant_to_iterator = 7;
     info.sequence_number = 123456;
@@ -103,7 +103,7 @@ TEST(DatalakeStateSerde, IcebergObjectSerializableInfoRoundTrip)
 TEST(DatalakeStateSerde, IcebergObjectSerializableInfoWithoutFileStats)
 {
     Iceberg::IcebergObjectSerializableInfo info;
-    info.data_object_file_path_key = DB::Iceberg::IcebergPathFromMetadata::deserialize("s3://bucket/path/to/file.parquet");
+    info.data_object_file_path_key = "s3://bucket/path/to/file.parquet";
     info.underlying_format_read_schema_id = 1;
     info.schema_id_relevant_to_iterator = 1;
     info.sequence_number = 0;
@@ -134,7 +134,7 @@ TEST(DatalakeStateSerde, IcebergObjectSerializableInfoNulloptFileStats)
 {
     /// Test round-trip when the fields are explicitly nullopt
     Iceberg::IcebergObjectSerializableInfo info;
-    info.data_object_file_path_key = DB::Iceberg::IcebergPathFromMetadata::deserialize("s3://bucket/path/to/file.parquet");
+    info.data_object_file_path_key = "s3://bucket/path/to/file.parquet";
     info.underlying_format_read_schema_id = 1;
     info.schema_id_relevant_to_iterator = 1;
     info.sequence_number = 0;