|
50 | 50 | BoundNotStartsWith, |
51 | 51 | BoundReference, |
52 | 52 | BoundStartsWith, |
| 53 | + EqualTo, |
53 | 54 | GreaterThan, |
| 55 | + IsNull, |
54 | 56 | Not, |
| 57 | + NotEqualTo, |
| 58 | + NotNull, |
55 | 59 | Or, |
56 | 60 | ) |
57 | 61 | from pyiceberg.expressions.literals import literal |
@@ -2317,3 +2321,66 @@ def test_pyarrow_io_multi_fs() -> None: |
2317 | 2321 |
|
2318 | 2322 | # Same PyArrowFileIO instance resolves local file input to LocalFileSystem |
2319 | 2323 | assert isinstance(pyarrow_file_io.new_input("file:///path/to/file")._filesystem, LocalFileSystem) |
| 2324 | + |
| 2325 | + |
def test_scan_nulls(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None:
    """Pin the row counts returned by null, not-null, equality and inequality filters.

    Each predicate on the ``string`` column is evaluated several ways: as a
    string row filter, as an expression object passed to ``table.scan``, and as
    a PyArrow compute expression applied to an unfiltered scan.
    """
    import pyarrow.compute as pc

    catalog.create_namespace("default")
    table = catalog.create_table("default.test_scan_nulls", schema=arrow_table_with_null.schema)
    table.append(arrow_table_with_null)

    # "string": ["a", None, "z"]
    string_col = pc.field("string")

    # IS NULL
    null_by_text = table.scan(row_filter="string is null").to_arrow()
    assert len(null_by_text) == 1
    null_by_expr = table.scan(row_filter=IsNull("string")).to_arrow()
    assert len(null_by_expr) == 1
    null_by_arrow = table.scan().to_arrow().filter(string_col.is_null())
    assert len(null_by_arrow) == 1

    # IS NOT NULL
    valid_by_text = table.scan(row_filter="string is not null").to_arrow()
    assert len(valid_by_text) == 2
    valid_by_expr = table.scan(row_filter=NotNull("string")).to_arrow()
    assert len(valid_by_expr) == 2
    valid_by_arrow = table.scan().to_arrow().filter(string_col.is_valid())
    assert len(valid_by_arrow) == 2

    # Equality with "a"
    equal_by_text = table.scan(row_filter="string == 'a'").to_arrow()
    assert len(equal_by_text) == 1
    equal_by_expr = table.scan(row_filter=EqualTo(term="string", literal="a")).to_arrow()
    assert len(equal_by_expr) == 1
    equal_by_arrow = table.scan().to_arrow().filter(string_col == "a")
    assert len(equal_by_arrow) == 1

    # Inequality with "a"
    # NOTE(review): the original author annotated this group with "this should be 2",
    # while the assertions below pin 1 — confirm which count is the intended one.
    unequal_by_text = table.scan(row_filter="string != 'a'").to_arrow()
    assert len(unequal_by_text) == 1
    unequal_by_expr = table.scan(row_filter=NotEqualTo(term="string", literal="a")).to_arrow()
    assert len(unequal_by_expr) == 1
    negated_equal = table.scan(row_filter=Not(EqualTo(term="string", literal="a"))).to_arrow()
    assert len(negated_equal) == 1
    unequal_by_arrow = table.scan().to_arrow().filter(string_col != "a")
    assert len(unequal_by_arrow) == 1
| 2354 | + |
| 2355 | + |
def test_scan_kleene(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None:
    """Pin the row counts when a null check is combined with an equality via AND / OR.

    The table is populated from ``arrow_table_with_null`` and scanned with
    string row filters that pair ``is null`` / ``is not null`` with
    ``string = 'a'``.
    """
    catalog.create_namespace("default")
    # Use a table identifier specific to this test rather than reusing the one
    # created by test_scan_nulls.
    table = catalog.create_table(
        "default.test_scan_kleene",
        schema=arrow_table_with_null.schema,
    )
    table.append(arrow_table_with_null)

    # "string": ["a", None, "z"]
    assert len(table.scan(row_filter="string is null OR string = 'a'").to_arrow()) == 2  # {null, a}
    assert len(table.scan(row_filter="string is null AND string = 'a'").to_arrow()) == 0  # {}
    assert len(table.scan(row_filter="string is not null OR string = 'a'").to_arrow()) == 2  # {a, z}
    assert len(table.scan(row_filter="string is not null AND string = 'a'").to_arrow()) == 1  # {a}
| 2369 | + |
| 2370 | + |
def test_scan_complements(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None:
    """Check that the complementary PyArrow filter of ``string == "a"`` selects two rows.

    The equality expression is first used directly as a row filter (one row
    expected), then bound to the table schema and converted with
    ``_expression_to_complementary_pyarrow``; the resulting filter is applied to
    an unfiltered scan.
    """
    from pyiceberg.expressions.visitors import bind
    from pyiceberg.io.pyarrow import _expression_to_complementary_pyarrow

    catalog.create_namespace("default")
    table = catalog.create_table("default.test_scan_complements", schema=arrow_table_with_null.schema)
    table.append(arrow_table_with_null)

    equals_a = EqualTo(term="string", literal="a")
    matching_rows = table.scan(row_filter=equals_a).to_arrow()
    assert len(matching_rows) == 1

    # Bind against the table schema, then build the complementary PyArrow filter.
    bound_equals_a = bind(table.schema(), equals_a, case_sensitive=False)
    complement_filter = _expression_to_complementary_pyarrow(bound_equals_a)

    every_row = table.scan().to_arrow()
    complement_rows = every_row.filter(complement_filter)
    assert len(complement_rows) == 2  # complements handles null correctly
0 commit comments