Skip to content

Commit 2558dcd

Browse files
authored
Reading upper/lower bounds values with type promotions (#3293)
1 parent e6d5129 commit 2558dcd

2 files changed

Lines changed: 98 additions & 2 deletions

File tree

pyiceberg/conversions.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,6 @@ def _(_: PrimitiveType, b: bytes) -> int:
343343
return _INT_STRUCT.unpack(b)[0]
344344

345345

346-
@from_bytes.register(LongType)
347346
@from_bytes.register(TimeType)
348347
@from_bytes.register(TimestampType)
349348
@from_bytes.register(TimestamptzType)
@@ -353,13 +352,24 @@ def _(_: PrimitiveType, b: bytes) -> int:
353352
return _LONG_STRUCT.unpack(b)[0]
354353

355354

355+
@from_bytes.register(LongType)
356+
def _(_: PrimitiveType, b: bytes) -> int:
357+
if len(b) < 8:
358+
# If the length is 4 bytes, it is a promoted IntegerType
359+
return _INT_STRUCT.unpack(b)[0]
360+
return _LONG_STRUCT.unpack(b)[0]
361+
362+
356363
@from_bytes.register(FloatType)
357364
def _(_: FloatType, b: bytes) -> float:
358365
return _FLOAT_STRUCT.unpack(b)[0]
359366

360367

361368
@from_bytes.register(DoubleType)
362369
def _(_: DoubleType, b: bytes) -> float:
370+
if len(b) < 8:
371+
# If the length is 4 bytes, it is a promoted FloatType
372+
return _FLOAT_STRUCT.unpack(b)[0]
363373
return _DOUBLE_STRUCT.unpack(b)[0]
364374

365375

tests/expressions/test_evaluator.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,14 @@
4141
Or,
4242
StartsWith,
4343
)
44-
from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator
44+
from pyiceberg.expressions.visitors import (
45+
ROWS_CANNOT_MATCH,
46+
ROWS_MIGHT_MATCH,
47+
ROWS_MIGHT_NOT_MATCH,
48+
ROWS_MUST_MATCH,
49+
_InclusiveMetricsEvaluator,
50+
_StrictMetricsEvaluator,
51+
)
4552
from pyiceberg.manifest import DataFile, FileFormat
4653
from pyiceberg.schema import Schema
4754
from pyiceberg.typedef import Record
@@ -50,6 +57,7 @@
5057
FloatType,
5158
IcebergType,
5259
IntegerType,
60+
LongType,
5361
NestedField,
5462
PrimitiveType,
5563
StringType,
@@ -1463,3 +1471,81 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file
14631471

14641472
should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("no_nulls", {"abc", "def"})).eval(strict_data_file_1)
14651473
assert not should_read, "Should not match: no_nulls field does not have bounds"
1474+
1475+
1476+
@pytest.mark.parametrize(
1477+
"file_type, evolved_type, lower_bound, upper_bound, op, lit, expected",
1478+
[
1479+
# Int -> Long
1480+
(IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_CANNOT_MATCH),
1481+
(IntegerType(), LongType(), 30, 79, LessThan, 50, ROWS_MIGHT_MATCH),
1482+
# Float -> Double
1483+
(FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_CANNOT_MATCH),
1484+
(FloatType(), DoubleType(), 30.0, 79.0, LessThan, 50.0, ROWS_MIGHT_MATCH),
1485+
],
1486+
)
1487+
def test_inclusive_metrics_eval_bounds_after_promotion(
1488+
file_type: PrimitiveType,
1489+
evolved_type: PrimitiveType,
1490+
lower_bound: Any,
1491+
upper_bound: Any,
1492+
op: Any,
1493+
lit: Any,
1494+
expected: bool,
1495+
) -> None:
1496+
schema = Schema(NestedField(1, "col", evolved_type, required=True))
1497+
1498+
data_file = DataFile.from_args(
1499+
file_path="file_1.parquet",
1500+
file_format=FileFormat.PARQUET,
1501+
partition={},
1502+
record_count=100,
1503+
file_size_in_bytes=1024,
1504+
lower_bounds={1: to_bytes(file_type, lower_bound)},
1505+
upper_bounds={1: to_bytes(file_type, upper_bound)},
1506+
)
1507+
1508+
evaluator = _InclusiveMetricsEvaluator(schema, op("col", lit))
1509+
assert evaluator.eval(data_file) == expected
1510+
1511+
1512+
@pytest.mark.parametrize(
1513+
"file_type, evolved_type, lower_bound, upper_bound, op, lit, expected",
1514+
[
1515+
# Int -> Long
1516+
(IntegerType(), LongType(), 30, 79, GreaterThan, 20, ROWS_MUST_MATCH),
1517+
(IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_MIGHT_NOT_MATCH),
1518+
(IntegerType(), LongType(), 30, 79, LessThan, 100, ROWS_MUST_MATCH),
1519+
(IntegerType(), LongType(), 30, 79, LessThan, 20, ROWS_MIGHT_NOT_MATCH),
1520+
# Float -> Double
1521+
(FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 20.0, ROWS_MUST_MATCH),
1522+
(FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_MIGHT_NOT_MATCH),
1523+
(FloatType(), DoubleType(), 30.0, 79.0, LessThan, 100.0, ROWS_MUST_MATCH),
1524+
(FloatType(), DoubleType(), 30.0, 79.0, LessThan, 20.0, ROWS_MIGHT_NOT_MATCH),
1525+
],
1526+
)
1527+
def test_strict_metrics_eval_bounds_after_promotion(
1528+
file_type: PrimitiveType,
1529+
evolved_type: PrimitiveType,
1530+
lower_bound: Any,
1531+
upper_bound: Any,
1532+
op: Any,
1533+
lit: Any,
1534+
expected: bool,
1535+
) -> None:
1536+
schema = Schema(NestedField(1, "col", evolved_type, required=True))
1537+
1538+
data_file = DataFile.from_args(
1539+
file_path="file_1.parquet",
1540+
file_format=FileFormat.PARQUET,
1541+
partition={},
1542+
record_count=100,
1543+
file_size_in_bytes=1024,
1544+
lower_bounds={1: to_bytes(file_type, lower_bound)},
1545+
upper_bounds={1: to_bytes(file_type, upper_bound)},
1546+
null_value_counts={1: 0},
1547+
nan_value_counts={1: 0},
1548+
)
1549+
1550+
evaluator = _StrictMetricsEvaluator(schema, op("col", lit))
1551+
assert evaluator.eval(data_file) == expected

0 commit comments

Comments
 (0)