|
41 | 41 | Or, |
42 | 42 | StartsWith, |
43 | 43 | ) |
44 | | -from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator |
| 44 | +from pyiceberg.expressions.visitors import ( |
| 45 | + ROWS_CANNOT_MATCH, |
| 46 | + ROWS_MIGHT_MATCH, |
| 47 | + ROWS_MIGHT_NOT_MATCH, |
| 48 | + ROWS_MUST_MATCH, |
| 49 | + _InclusiveMetricsEvaluator, |
| 50 | + _StrictMetricsEvaluator, |
| 51 | +) |
45 | 52 | from pyiceberg.manifest import DataFile, FileFormat |
46 | 53 | from pyiceberg.schema import Schema |
47 | 54 | from pyiceberg.typedef import Record |
@@ -1466,44 +1473,85 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file |
1466 | 1473 | assert not should_read, "Should not match: no_nulls field does not have bounds" |
1467 | 1474 |
|
1468 | 1475 |
|
1469 | | -def test_inclusive_metrics_evaluator_with_type_promotion_crash() -> None: |
1470 | | - # Schema defines 'id' as LongType (evolved state) |
1471 | | - schema = Schema(NestedField(1, "id", LongType(), required=True)) |
| 1476 | +@pytest.mark.parametrize( |
| 1477 | + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", |
| 1478 | + [ |
| 1479 | + # Int -> Long |
| 1480 | + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_CANNOT_MATCH), |
| 1481 | + (IntegerType(), LongType(), 30, 79, LessThan, 50, ROWS_MIGHT_MATCH), |
| 1482 | + # Float -> Double |
| 1483 | + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_CANNOT_MATCH), |
| 1484 | + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 50.0, ROWS_MIGHT_MATCH), |
| 1485 | + ], |
| 1486 | +) |
| 1487 | +def test_inclusive_metrics_evaluator_with_type_promotion( |
| 1488 | + file_type: PrimitiveType, |
| 1489 | + evolved_type: PrimitiveType, |
| 1490 | + lower_bound: Any, |
| 1491 | + upper_bound: Any, |
| 1492 | + op: Any, |
| 1493 | + lit: Any, |
| 1494 | + expected: bool, |
| 1495 | +) -> None: |
| 1496 | + # Schema defines 'col' with evolved state |
| 1497 | + schema = Schema(NestedField(1, "col", evolved_type, required=True)) |
1472 | 1498 |
|
1473 | | - # Historical manifest contains 4-byte integer bounds |
| 1499 | + # Historical manifest contains file_type bounds |
1474 | 1500 | data_file = DataFile.from_args( |
1475 | 1501 | file_path="file_1.parquet", |
1476 | 1502 | file_format=FileFormat.PARQUET, |
1477 | 1503 | partition={}, |
1478 | 1504 | record_count=100, |
1479 | 1505 | file_size_in_bytes=1024, |
1480 | | - lower_bounds={1: to_bytes(IntegerType(), 30)}, |
1481 | | - upper_bounds={1: to_bytes(IntegerType(), 79)}, |
| 1506 | + lower_bounds={1: to_bytes(file_type, lower_bound)}, |
| 1507 | + upper_bounds={1: to_bytes(file_type, upper_bound)}, |
1482 | 1508 | ) |
1483 | 1509 |
|
1484 | | - # Predicate: id > 100 |
1485 | | - # Decodes 4-byte bounds correctly and skips the file |
1486 | | - evaluator_pruning = _InclusiveMetricsEvaluator(schema, GreaterThan("id", 100)) |
1487 | | - assert not evaluator_pruning.eval(data_file) |
1488 | | - |
1489 | | - |
1490 | | -def test_inclusive_metrics_evaluator_with_float_to_double_promotion() -> None: |
1491 | | - # Schema defines 'val' as DoubleType (evolved state) |
1492 | | - schema = Schema(NestedField(1, "val", DoubleType(), required=True)) |
| 1510 | + # Predicate refers to 'col' |
| 1511 | + evaluator = _InclusiveMetricsEvaluator(schema, op("col", lit)) |
| 1512 | + assert evaluator.eval(data_file) == expected |
| 1513 | + |
| 1514 | + |
| 1515 | +@pytest.mark.parametrize( |
| 1516 | + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", |
| 1517 | + [ |
| 1518 | + # Int -> Long |
| 1519 | + (IntegerType(), LongType(), 30, 79, GreaterThan, 20, ROWS_MUST_MATCH), |
| 1520 | + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_MIGHT_NOT_MATCH), |
| 1521 | + (IntegerType(), LongType(), 30, 79, LessThan, 100, ROWS_MUST_MATCH), |
| 1522 | + (IntegerType(), LongType(), 30, 79, LessThan, 20, ROWS_MIGHT_NOT_MATCH), |
| 1523 | + # Float -> Double |
| 1524 | + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 20.0, ROWS_MUST_MATCH), |
| 1525 | + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_MIGHT_NOT_MATCH), |
| 1526 | + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 100.0, ROWS_MUST_MATCH), |
| 1527 | + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 20.0, ROWS_MIGHT_NOT_MATCH), |
| 1528 | + ], |
| 1529 | +) |
| 1530 | +def test_strict_metrics_evaluator_with_type_promotion( |
| 1531 | + file_type: PrimitiveType, |
| 1532 | + evolved_type: PrimitiveType, |
| 1533 | + lower_bound: Any, |
| 1534 | + upper_bound: Any, |
| 1535 | + op: Any, |
| 1536 | + lit: Any, |
| 1537 | + expected: bool, |
| 1538 | +) -> None: |
| 1539 | + # Schema defines 'col' with evolved state |
| 1540 | + schema = Schema(NestedField(1, "col", evolved_type, required=True)) |
1493 | 1541 |
|
1494 | | - # Historical manifest contains 4-byte float bounds |
| 1542 | + # Historical manifest contains file_type bounds |
1495 | 1543 | data_file = DataFile.from_args( |
1496 | 1544 | file_path="file_1.parquet", |
1497 | 1545 | file_format=FileFormat.PARQUET, |
1498 | 1546 | partition={}, |
1499 | 1547 | record_count=100, |
1500 | 1548 | file_size_in_bytes=1024, |
1501 | | - lower_bounds={1: to_bytes(FloatType(), 30.0)}, |
1502 | | - upper_bounds={1: to_bytes(FloatType(), 79.0)}, |
| 1549 | + lower_bounds={1: to_bytes(file_type, lower_bound)}, |
| 1550 | + upper_bounds={1: to_bytes(file_type, upper_bound)}, |
| 1551 | + null_value_counts={1: 0}, |
| 1552 | + nan_value_counts={1: 0}, |
1503 | 1553 | ) |
1504 | 1554 |
|
1505 | | - # Predicate: val < 50.0 |
1506 | | - evaluator = _InclusiveMetricsEvaluator(schema, LessThan("val", 50.0)) |
1507 | | - |
1508 | | - # Should not crash and should correctly identify that the file might match |
1509 | | - assert evaluator.eval(data_file) |
| 1555 | + # Predicate refers to 'col' |
| 1556 | + evaluator = _StrictMetricsEvaluator(schema, op("col", lit)) |
| 1557 | + assert evaluator.eval(data_file) == expected |
0 commit comments