Skip to content

Commit 803ac61

Browse files
committed
PR comments
1 parent 095fd76 commit 803ac61

2 files changed

Lines changed: 76 additions & 29 deletions

File tree

pyiceberg/expressions/visitors.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@
5959
from pyiceberg.schema import Schema
6060
from pyiceberg.typedef import EMPTY_DICT, L, LiteralValue, Record, StructProtocol
6161
from pyiceberg.types import (
62-
DateType,
6362
DoubleType,
6463
FloatType,
6564
IcebergType,
@@ -68,7 +67,6 @@
6867
NestedField,
6968
PrimitiveType,
7069
StructType,
71-
TimestampNanoType,
7270
TimestampType,
7371
TimestamptzType,
7472
)
@@ -78,13 +76,14 @@
7876

7977

8078
def _from_bytes_with_promotion(field_type: PrimitiveType, b: bytes) -> Any:
79+
# Integer, Float, Date are 4 bytes
80+
# Long, Double, Timestamps are 8 bytes
81+
# If we have 4 bytes, we may have to handle type promotion.
8182
if len(b) == 4:
8283
if isinstance(field_type, LongType):
8384
return from_bytes(IntegerType(), b)
8485
elif isinstance(field_type, DoubleType):
8586
return from_bytes(FloatType(), b)
86-
elif isinstance(field_type, (TimestampType, TimestampNanoType)):
87-
return from_bytes(DateType(), b)
8887
return from_bytes(field_type, b)
8988

9089

@@ -555,7 +554,7 @@ def visit_or(self, left_result: bool, right_result: bool) -> bool:
555554
def _from_byte_buffer(field_type: IcebergType, val: bytes) -> Any:
556555
if not isinstance(field_type, PrimitiveType):
557556
raise ValueError(f"Expected a PrimitiveType, got: {type(field_type)}")
558-
return from_bytes(field_type, val)
557+
return _from_bytes_with_promotion(field_type, val)
559558

560559

561560
class _ManifestEvalVisitor(BoundBooleanExpressionVisitor[bool]):

tests/expressions/test_evaluator.py

Lines changed: 72 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,14 @@
4141
Or,
4242
StartsWith,
4343
)
44-
from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator
44+
from pyiceberg.expressions.visitors import (
45+
ROWS_CANNOT_MATCH,
46+
ROWS_MIGHT_MATCH,
47+
ROWS_MIGHT_NOT_MATCH,
48+
ROWS_MUST_MATCH,
49+
_InclusiveMetricsEvaluator,
50+
_StrictMetricsEvaluator,
51+
)
4552
from pyiceberg.manifest import DataFile, FileFormat
4653
from pyiceberg.schema import Schema
4754
from pyiceberg.typedef import Record
@@ -1466,44 +1473,85 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file
14661473
assert not should_read, "Should not match: no_nulls field does not have bounds"
14671474

14681475

1469-
def test_inclusive_metrics_evaluator_with_type_promotion_crash() -> None:
1470-
# Schema defines 'id' as LongType (evolved state)
1471-
schema = Schema(NestedField(1, "id", LongType(), required=True))
1476+
@pytest.mark.parametrize(
1477+
"file_type, evolved_type, lower_bound, upper_bound, op, lit, expected",
1478+
[
1479+
# Int -> Long
1480+
(IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_CANNOT_MATCH),
1481+
(IntegerType(), LongType(), 30, 79, LessThan, 50, ROWS_MIGHT_MATCH),
1482+
# Float -> Double
1483+
(FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_CANNOT_MATCH),
1484+
(FloatType(), DoubleType(), 30.0, 79.0, LessThan, 50.0, ROWS_MIGHT_MATCH),
1485+
],
1486+
)
1487+
def test_inclusive_metrics_evaluator_with_type_promotion(
1488+
file_type: PrimitiveType,
1489+
evolved_type: PrimitiveType,
1490+
lower_bound: Any,
1491+
upper_bound: Any,
1492+
op: Any,
1493+
lit: Any,
1494+
expected: bool,
1495+
) -> None:
1496+
# Schema defines 'col' with evolved state
1497+
schema = Schema(NestedField(1, "col", evolved_type, required=True))
14721498

1473-
# Historical manifest contains 4-byte integer bounds
1499+
# Historical manifest contains file_type bounds
14741500
data_file = DataFile.from_args(
14751501
file_path="file_1.parquet",
14761502
file_format=FileFormat.PARQUET,
14771503
partition={},
14781504
record_count=100,
14791505
file_size_in_bytes=1024,
1480-
lower_bounds={1: to_bytes(IntegerType(), 30)},
1481-
upper_bounds={1: to_bytes(IntegerType(), 79)},
1506+
lower_bounds={1: to_bytes(file_type, lower_bound)},
1507+
upper_bounds={1: to_bytes(file_type, upper_bound)},
14821508
)
14831509

1484-
# Predicate: id > 100
1485-
# Decodes 4-byte bounds correctly and skips the file
1486-
evaluator_pruning = _InclusiveMetricsEvaluator(schema, GreaterThan("id", 100))
1487-
assert not evaluator_pruning.eval(data_file)
1488-
1489-
1490-
def test_inclusive_metrics_evaluator_with_float_to_double_promotion() -> None:
1491-
# Schema defines 'val' as DoubleType (evolved state)
1492-
schema = Schema(NestedField(1, "val", DoubleType(), required=True))
1510+
# Predicate refers to 'col'
1511+
evaluator = _InclusiveMetricsEvaluator(schema, op("col", lit))
1512+
assert evaluator.eval(data_file) == expected
1513+
1514+
1515+
@pytest.mark.parametrize(
1516+
"file_type, evolved_type, lower_bound, upper_bound, op, lit, expected",
1517+
[
1518+
# Int -> Long
1519+
(IntegerType(), LongType(), 30, 79, GreaterThan, 20, ROWS_MUST_MATCH),
1520+
(IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_MIGHT_NOT_MATCH),
1521+
(IntegerType(), LongType(), 30, 79, LessThan, 100, ROWS_MUST_MATCH),
1522+
(IntegerType(), LongType(), 30, 79, LessThan, 20, ROWS_MIGHT_NOT_MATCH),
1523+
# Float -> Double
1524+
(FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 20.0, ROWS_MUST_MATCH),
1525+
(FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_MIGHT_NOT_MATCH),
1526+
(FloatType(), DoubleType(), 30.0, 79.0, LessThan, 100.0, ROWS_MUST_MATCH),
1527+
(FloatType(), DoubleType(), 30.0, 79.0, LessThan, 20.0, ROWS_MIGHT_NOT_MATCH),
1528+
],
1529+
)
1530+
def test_strict_metrics_evaluator_with_type_promotion(
1531+
file_type: PrimitiveType,
1532+
evolved_type: PrimitiveType,
1533+
lower_bound: Any,
1534+
upper_bound: Any,
1535+
op: Any,
1536+
lit: Any,
1537+
expected: bool,
1538+
) -> None:
1539+
# Schema defines 'col' with evolved state
1540+
schema = Schema(NestedField(1, "col", evolved_type, required=True))
14931541

1494-
# Historical manifest contains 4-byte float bounds
1542+
# Historical manifest contains file_type bounds
14951543
data_file = DataFile.from_args(
14961544
file_path="file_1.parquet",
14971545
file_format=FileFormat.PARQUET,
14981546
partition={},
14991547
record_count=100,
15001548
file_size_in_bytes=1024,
1501-
lower_bounds={1: to_bytes(FloatType(), 30.0)},
1502-
upper_bounds={1: to_bytes(FloatType(), 79.0)},
1549+
lower_bounds={1: to_bytes(file_type, lower_bound)},
1550+
upper_bounds={1: to_bytes(file_type, upper_bound)},
1551+
null_value_counts={1: 0},
1552+
nan_value_counts={1: 0},
15031553
)
15041554

1505-
# Predicate: val < 50.0
1506-
evaluator = _InclusiveMetricsEvaluator(schema, LessThan("val", 50.0))
1507-
1508-
# Should not crash and should correctly identify that the file might match
1509-
assert evaluator.eval(data_file)
1555+
# Predicate refers to 'col'
1556+
evaluator = _StrictMetricsEvaluator(schema, op("col", lit))
1557+
assert evaluator.eval(data_file) == expected

0 commit comments

Comments
 (0)