|
6 | 6 | from uuid import UUID |
7 | 7 |
|
8 | 8 | from metadata.generated.schema.api.data.createTable import CreateTableRequest |
| 9 | +from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest |
9 | 10 | from metadata.generated.schema.entity.data.pipeline import Pipeline, Task |
10 | 11 | from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( |
11 | 12 | OpenMetadataConnection, |
@@ -597,6 +598,187 @@ def test_get_create_table_request(self, mock_get_schema_fqn, mock_get_table_fqn) |
597 | 598 | create_request.columns[i].dataTypeDisplay, expected_type_display |
598 | 599 | ) |
599 | 600 |
|
| 601 | + @patch("confluent_kafka.Consumer") |
| 602 | + def test_get_pipelines_list_filters_complete_events(self, mock_consumer_class): |
| 603 | + """Test that get_pipelines_list returns COMPLETE events""" |
| 604 | + event = copy.deepcopy(VALID_EVENT) |
| 605 | + event["eventType"] = "COMPLETE" |
| 606 | + self.setup_mock_consumer_with_kafka_event(event) |
| 607 | + |
| 608 | + result_generator = self.open_lineage_source.get_pipelines_list() |
| 609 | + results = list(result_generator) |
| 610 | + |
| 611 | + self.assertEqual(len(results), 1) |
| 612 | + self.assertIsInstance(results[0], OpenLineageEvent) |
| 613 | + self.assertEqual(results[0].event_type, "COMPLETE") |
| 614 | + |
| 615 | + @patch("confluent_kafka.Consumer") |
| 616 | + def test_get_pipelines_list_filters_running_events(self, mock_consumer_class): |
| 617 | + """Test that get_pipelines_list returns RUNNING events""" |
| 618 | + event = copy.deepcopy(VALID_EVENT) |
| 619 | + event["eventType"] = "RUNNING" |
| 620 | + self.setup_mock_consumer_with_kafka_event(event) |
| 621 | + |
| 622 | + result_generator = self.open_lineage_source.get_pipelines_list() |
| 623 | + results = list(result_generator) |
| 624 | + |
| 625 | + self.assertEqual(len(results), 1) |
| 626 | + self.assertIsInstance(results[0], OpenLineageEvent) |
| 627 | + self.assertEqual(results[0].event_type, "RUNNING") |
| 628 | + |
| 629 | + @patch("confluent_kafka.Consumer") |
| 630 | + def test_get_pipelines_list_filters_start_events(self, mock_consumer_class): |
| 631 | + """Test that get_pipelines_list returns START events""" |
| 632 | + event = copy.deepcopy(VALID_EVENT) |
| 633 | + event["eventType"] = "START" |
| 634 | + self.setup_mock_consumer_with_kafka_event(event) |
| 635 | + |
| 636 | + result_generator = self.open_lineage_source.get_pipelines_list() |
| 637 | + results = list(result_generator) |
| 638 | + |
| 639 | + self.assertEqual(len(results), 1) |
| 640 | + self.assertIsInstance(results[0], OpenLineageEvent) |
| 641 | + self.assertEqual(results[0].event_type, "START") |
| 642 | + |
| 643 | + @patch("confluent_kafka.Consumer") |
| 644 | + def test_get_pipelines_list_filters_out_fail_events(self, mock_consumer_class): |
| 645 | + """Test that get_pipelines_list filters out FAIL events""" |
| 646 | + event = copy.deepcopy(VALID_EVENT) |
| 647 | + event["eventType"] = "FAIL" |
| 648 | + self.setup_mock_consumer_with_kafka_event(event) |
| 649 | + |
| 650 | + result_generator = self.open_lineage_source.get_pipelines_list() |
| 651 | + results = list(result_generator) |
| 652 | + |
| 653 | + self.assertEqual(len(results), 0) |
| 654 | + |
| 655 | + @patch("confluent_kafka.Consumer") |
| 656 | + def test_get_pipelines_list_filters_out_abort_events(self, mock_consumer_class): |
| 657 | + """Test that get_pipelines_list filters out ABORT events""" |
| 658 | + event = copy.deepcopy(VALID_EVENT) |
| 659 | + event["eventType"] = "ABORT" |
| 660 | + self.setup_mock_consumer_with_kafka_event(event) |
| 661 | + |
| 662 | + result_generator = self.open_lineage_source.get_pipelines_list() |
| 663 | + results = list(result_generator) |
| 664 | + |
| 665 | + self.assertEqual(len(results), 0) |
| 666 | + |
    @patch(
        "metadata.ingestion.source.pipeline.openlineage.metadata.OpenlineageSource._get_table_fqn_from_om"
    )
    def test_lineage_merge_start_with_data_running_without(self, mock_get_table_fqn):
        """
        Test that START event with lineage data followed by RUNNING event without
        lineage data does not overwrite existing lineage in the database.

        This simulates Flink streaming jobs where:
        - START event contains initial lineage
        - RUNNING events are heartbeats with no/empty lineage

        The test verifies the complete flow:
        1. START event creates lineage with column details
        2. RUNNING event with empty data is processed
        3. Query back the lineage - it should still have the original data
        """
        # Create START event with lineage data
        start_event = copy.deepcopy(FULL_OL_KAFKA_EVENT)
        start_event["eventType"] = "START"

        # Create RUNNING event with same job but no lineage (empty inputs/outputs)
        running_event = copy.deepcopy(FULL_OL_KAFKA_EVENT)
        running_event["eventType"] = "RUNNING"
        running_event["inputs"] = []
        running_event["outputs"] = []

        # Mock table FQN lookup
        def mock_fqn_side_effect(table_details):
            return f"testService.shopify.{table_details.name}"

        mock_get_table_fqn.side_effect = mock_fqn_side_effect

        # Mock metadata.get_by_name for table lookups
        from_table_id = "69fc8906-4a4a-45ab-9a54-9cc2d399e10e"
        to_table_id = "59fc8906-4a4a-45ab-9a54-9cc2d399e10e"

        # Dispatch canned entities by FQN; any other FQN resolves to None
        # (i.e. "entity not found") so the code under test must tolerate misses.
        def mock_get_uuid_by_name(entity, fqn):
            if fqn == "testService.shopify.raw_product_catalog":
                return Mock(id=from_table_id)
            elif fqn == "testService.shopify.fact_order_new5":
                return Mock(id=to_table_id)
            elif "openlineage_source" in fqn:  # Pipeline entity
                return Mock(id=Mock(root="79fc8906-4a4a-45ab-9a54-9cc2d399e10e"))
            return None

        # Process START event with lineage
        # NOTE(review): `get_by_name` is patched onto OpenMetadataConnection with
        # create=True, meaning the attribute does not normally exist on that class.
        # Confirm the source under test actually resolves get_by_name through this
        # object (get_by_name typically lives on the OpenMetadata client, not the
        # connection config) — otherwise mock_get_uuid_by_name is never invoked and
        # the test may pass vacuously.
        start_ol_event = message_to_open_lineage_event(start_event)
        with patch.object(
            OpenMetadataConnection,
            "get_by_name",
            create=True,
            side_effect=mock_get_uuid_by_name,
        ):
            start_lineage_results = list(
                self.open_lineage_source.yield_pipeline_lineage_details(start_ol_event)
            )

        # Process RUNNING event without lineage
        running_ol_event = message_to_open_lineage_event(running_event)
        with patch.object(
            OpenMetadataConnection,
            "get_by_name",
            create=True,
            side_effect=mock_get_uuid_by_name,
        ):
            running_lineage_results = list(
                self.open_lineage_source.yield_pipeline_lineage_details(
                    running_ol_event
                )
            )

        # Extract lineage requests from START event
        # (yield_pipeline_lineage_details yields Either-style results; `.right`
        # holds the success payload)
        start_lineage_requests = [
            r.right
            for r in start_lineage_results
            if r.right and isinstance(r.right, AddLineageRequest)
        ]

        # Extract lineage requests from RUNNING event
        running_lineage_requests = [
            r.right
            for r in running_lineage_results
            if r.right and isinstance(r.right, AddLineageRequest)
        ]

        # Verify START event produced lineage with column details
        start_requests_with_columns = [
            req
            for req in start_lineage_requests
            if req.edge.lineageDetails and req.edge.lineageDetails.columnsLineage
        ]
        self.assertGreater(
            len(start_requests_with_columns),
            0,
            "START event should produce lineage requests with column details",
        )

        # Count column lineage entries from START
        start_column_count = sum(
            len(req.edge.lineageDetails.columnsLineage)
            for req in start_requests_with_columns
        )
        self.assertGreater(
            start_column_count, 0, "START event should have column lineage"
        )

        # Key assertion: RUNNING event with empty inputs/outputs produces no lineage requests
        # This prevents empty data from being sent to the database
        self.assertEqual(
            len(running_lineage_requests),
            0,
            "RUNNING event with empty inputs/outputs should not produce any lineage requests",
        )
| 781 | + |
600 | 782 |
|
601 | 783 | if __name__ == "__main__": |
602 | 784 | unittest.main() |
0 commit comments