From 652c0902192ce5dbf130cc5d59dbdb82a027f1f0 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 15:37:49 -0700
Subject: [PATCH 01/25] chore: update test pipeline to run higher spark version

---
 .github/workflows/tests.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4f7b054a0..f3cc345c1 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -127,12 +127,12 @@ jobs:
     strategy:
       matrix:
         os: [ ubuntu-22.04 ]
-        python-version: [3.8]
+        python-version: ["3.9", "3.10", "3.11"]
         pandas: ["pandas>1.1"]
-        spark: ["3.0.1"]
-        hadoop: [ 2.7 ]
+        spark: ["3.4.4", "3.5.5"]
+        hadoop: [ 3.7 ]
         numpy: ["numpy"]
-        java_home: [ /usr/lib/jvm/java-8-openjdk-amd64 ]
+        java_home: [ /usr/lib/jvm/java-11-openjdk-amd64 ]
 
     name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
     env:
@@ -172,7 +172,7 @@ jobs:
             ${{ runner.os }}-${{ matrix.pandas }}-pip-
       - run: |
           pip install --upgrade pip setuptools wheel
-          pip install pytest-spark>=0.6.0 pyarrow==1.0.1 pyspark=="${{ matrix.spark }}"
+          pip install pyarrow>4.0.0 pyspark=="${{ matrix.spark }}"
          pip install ".[test]"
           pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
       - if: ${{ matrix.spark != '3.0.1' }}
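A note on the run step above before the next patch: in a shell, the unquoted constraint in pip install pyarrow>4.0.0 is parsed as an output redirection to a file named 4.0.0, so pip only ever sees "pyarrow"; the pyspark pin survives because it is quoted. An illustration of the two shell-safe spellings (illustrative only, not part of any patch in this series):

    import subprocess

    # argv form: no shell is involved, so ">" reaches pip as part of the
    # requirement string and the version constraint actually applies.
    subprocess.run(["pip", "install", "pyarrow>4.0.0"], check=True)

    # shell form: the constraint must be quoted; otherwise the command
    # degrades to plain "pip install pyarrow" with stdout redirected to a
    # file literally named "4.0.0".
    subprocess.run('pip install "pyarrow>4.0.0"', shell=True, check=True)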
From 5e969267c49c5d50b5620be628e4e31c6fc830e5 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 15:46:55 -0700
Subject: [PATCH 02/25] chore: update spark test CI/CD pipeline.

---
 .github/workflows/tests.yml | 73 ++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f3cc345c1..98bac977f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -123,63 +123,70 @@ jobs:
 
   test_spark:
     runs-on: ${{ matrix.os }}
-    continue-on-error: True
+    continue-on-error: true
     strategy:
       matrix:
         os: [ ubuntu-22.04 ]
-        python-version: ["3.9", "3.10", "3.11"]
-        pandas: ["pandas>1.1"]
-        spark: ["3.4.4", "3.5.5"]
-        hadoop: [ 3.7 ]
-        numpy: ["numpy"]
-        java_home: [ /usr/lib/jvm/java-11-openjdk-amd64 ]
+        python-version: [ "3.9", "3.10", "3.11" ]
+        pandas: [ "pandas>1.1" ]
+        spark: [ "3.4.4", "3.5.0" ]
+        hadoop: [ "3.3" ]
+        numpy: [ "numpy" ]
+        java_home: [ "/usr/lib/jvm/java-11-openjdk-amd64" ]
+        analytics: [ "false" ]
 
     name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
     env:
       JAVA_HOME: ${{ matrix.java_home }}
       SPARK_VERSION: ${{ matrix.spark }}
       HADOOP_VERSION: ${{ matrix.hadoop }}
-      SPARK_DIRECTORY: ${{ github.workspace }}/../
-      SPARK_HOME: ${{ github.workspace }}/../spark/
+      SPARK_HOME: ${{ github.workspace }}/spark
       YDATA_PROFILING_NO_ANALYTICS: ${{ matrix.analytics }}
+
     steps:
       - uses: actions/checkout@v4
-      - name: Setup python
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openjdk-11-jdk curl tar
+
+      - name: Setup Python
         uses: actions/setup-python@v5
         with:
          python-version: ${{ matrix.python-version }}
           architecture: x64
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Linux')
+
+      - name: Cache pip dependencies
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-\
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'macOS')
-        with:
-          path: ~/Library/Caches/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+          key: ${{ runner.os }}-pip-${{ hashFiles('requirements/*.txt', 'setup.cfg', 'pyproject.toml') }}
           restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Windows')
-        with:
-          path: ~\AppData\Local\pip\Cache
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - run: |
+            ${{ runner.os }}-pip-
+
+      - name: Install Python Dependencies
+        run: |
           pip install --upgrade pip setuptools wheel
-          pip install pyarrow>4.0.0 pyspark=="${{ matrix.spark }}"
+          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}" --no-cache-dir
+          pip install pyarrow>4.0.0 pyspark=="${{ matrix.spark }}" --no-cache-dir
           pip install ".[test]"
-          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
+
+      - name: Download and Install Spark
+        run: |
+          SPARK_TGZ="spark-${{ matrix.spark }}-bin-hadoop${{ matrix.hadoop }}.tgz"
+          SPARK_URL="https://archive.apache.org/dist/spark/spark-${{ matrix.spark }}/${SPARK_TGZ}"
+          curl -sL "$SPARK_URL" | tar xz
+          mv spark-* $SPARK_HOME
+          echo "SPARK_HOME=${SPARK_HOME}" >> $GITHUB_ENV
+          echo "PATH=${SPARK_HOME}/bin:$PATH" >> $GITHUB_ENV
+
       - if: ${{ matrix.spark != '3.0.1' }}
         run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
+
       - run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
+
       - run: make install
-      - run: make install-spark-ci
-      - run: pip install ".[spark]" # Make sure the proper version of pandas is install after everything
       - run: make test_spark
+
From cec71998822a1d4fd3dfb00c8ae5eed391277736 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:03:18 -0700
Subject: [PATCH 03/25] chore: fix spark CI/CD

---
 .github/workflows/tests.yml | 69 ++++++++++++++-----------------
 1 file changed, 25 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 98bac977f..ff4ff3447 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -122,34 +122,26 @@ jobs:
       - run: codecov -F py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.pandas }}-${{ matrix.numpy }}
 
   test_spark:
-    runs-on: ${{ matrix.os }}
-    continue-on-error: true
+    runs-on: ubuntu-22.04
+    continue-on-error: false
     strategy:
       matrix:
-        os: [ ubuntu-22.04 ]
-        python-version: [ "3.9", "3.10", "3.11" ]
-        pandas: [ "pandas>1.1" ]
-        spark: [ "3.4.4", "3.5.0" ]
-        hadoop: [ "3.3" ]
-        numpy: [ "numpy" ]
-        java_home: [ "/usr/lib/jvm/java-11-openjdk-amd64" ]
-        analytics: [ "false" ]
-
-    name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
-    env:
-      JAVA_HOME: ${{ matrix.java_home }}
-      SPARK_VERSION: ${{ matrix.spark }}
-      HADOOP_VERSION: ${{ matrix.hadoop }}
-      SPARK_HOME: ${{ github.workspace }}/spark
-      YDATA_PROFILING_NO_ANALYTICS: ${{ matrix.analytics }}
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        pyspark-version: [ "3.4.4", "3.5.0" ]
+
+    name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}
 
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout Code
+        uses: actions/checkout@v4
 
-      - name: Install system dependencies
+      - name: Install Java (OpenJDK 11)
         run: |
           sudo apt-get update
-          sudo apt-get install -y openjdk-11-jdk curl tar
+          sudo apt-get install -y openjdk-11-jdk
+          echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $GITHUB_ENV
+          echo "PATH=$JAVA_HOME/bin:$PATH" >> $GITHUB_ENV
+          java -version
 
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
 
       - name: Cache pip dependencies
         uses: actions/cache@v4
         with:
           path: ~/.cache/pip
-          key: ${{ runner.os }}-pip-${{ hashFiles('requirements/*.txt', 'setup.cfg', 'pyproject.toml') }}
+          key: pip-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pyspark-version }}-${{ hashFiles('requirements/*.txt', 'setup.cfg', 'pyproject.toml') }}
           restore-keys: |
-            ${{ runner.os }}-pip-
+            pip-${{ runner.os }}-
 
-      - name: Install Python Dependencies
+      - name: Install Dependencies
         run: |
-          pip install --upgrade pip setuptools wheel
-          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}" --no-cache-dir
-          pip install pyarrow>4.0.0 pyspark=="${{ matrix.spark }}" --no-cache-dir
-          pip install ".[test]"
+          python -m pip install --upgrade pip setuptools wheel
+          pip install pyarrow>4.0.0 pyspark=="${{ matrix.pyspark-version }}" --no-cache-dir
+          echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
+          echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
 
-      - name: Download and Install Spark
+      - name: Run Tests
         run: |
-          SPARK_TGZ="spark-${{ matrix.spark }}-bin-hadoop${{ matrix.hadoop }}.tgz"
-          SPARK_URL="https://archive.apache.org/dist/spark/spark-${{ matrix.spark }}/${SPARK_TGZ}"
-          curl -sL "$SPARK_URL" | tar xz
-          mv spark-* $SPARK_HOME
-          echo "SPARK_HOME=${SPARK_HOME}" >> $GITHUB_ENV
-          echo "PATH=${SPARK_HOME}/bin:$PATH" >> $GITHUB_ENV
-
-      - if: ${{ matrix.spark != '3.0.1' }}
-        run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
-
-      - run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
-
-      - run: make install
-      - run: make test_spark
-
+          make install
+          make install-spark-ci
+          pip install ".[spark]"
+          make test_spark

From 10e1cb9044d77493119791748333b5f95454ac26 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:12:59 -0700
Subject: [PATCH 04/25] chore: remove make spark-ci

---
 .github/workflows/tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ff4ff3447..fb3916342 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -167,7 +167,6 @@ jobs:
       - name: Run Tests
         run: |
           make install
-          make install-spark-ci
           pip install ".[spark]"
           make test_spark
 

From ebdb2d4a14ba6237910a68b0cda50b6a7c31c596 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:16:57 -0700
Subject: [PATCH 05/25] chore: add pytest to the dependencies

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index fb3916342..5c3ab8ac7 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -167,6 +167,6 @@ jobs:
       - name: Run Tests
         run: |
           make install
-          pip install ".[spark]"
+          pip install ".[spark, test]"
           make test_spark
 
"numba>=0.56.0, <1", + "numba>=0.56.0, <=0.61", ] dynamic = ["version"] From 0f085dcdc85f8417a29178cdc9853095f076e1d9 Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:33:03 -0700 Subject: [PATCH 07/25] feat: update pyspark install --- .github/workflows/tests.yml | 2 +- pyproject.toml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5c3ab8ac7..08136a8f5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -167,6 +167,6 @@ jobs: - name: Run Tests run: | make install - pip install ".[spark, test]" + pip install ".[test]" make test_spark diff --git a/pyproject.toml b/pyproject.toml index 83fdd8eea..9f5814c8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,10 +84,10 @@ dev = [ # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly spark = [ - "pyspark>=2.3.0", - "pyarrow>=2.0.0", - "pandas>1.1, <2, !=1.4.0", - "numpy>=1.16.0,<1.24", + "pyspark>=3.0.*", + "pyarrow>=4.0.0", + "pandas>1.1", + "numpy>=1.16.0", "visions[type_image_path]>=0.7.5, <0.7.7", ] test = [ From 443e4421032dd11443c828f8ed75de77dc0b2cda Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:36:57 -0700 Subject: [PATCH 08/25] fix: pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9f5814c8d..508f50c18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,7 +84,7 @@ dev = [ # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly spark = [ - "pyspark>=3.0.*", + "pyspark>=3.0", "pyarrow>=4.0.0", "pandas>1.1", "numpy>=1.16.0", From d55d4a65ff50452fb49e6735d69506caf255bd44 Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:40:48 -0700 Subject: [PATCH 09/25] chore: fix makefile to run the tests --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 46a4a28d5..22a69c758 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ test: ydata_profiling -h test_spark: - pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/ + pytest tests/backends/spark_backend/ ydata_profiling -h test_cov: @@ -36,7 +36,7 @@ install-docs: install ### Installs regular and docs dependencies install-spark-ci: sudo apt-get update - sudo apt-get -y install openjdk-8-jdk + sudo apt-get -y install openjdk-11-jdk curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ --output ${SPARK_DIRECTORY}/spark.tgz cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark From a881b01ef6487e0d51f98e48f9dd44f480cde49c Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Mon, 17 Mar 2025 17:09:19 -0700 Subject: [PATCH 10/25] chore: tests for pyspark versions bigger than 3.4 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 08136a8f5..17920d72b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ 
From d55d4a65ff50452fb49e6735d69506caf255bd44 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:40:48 -0700
Subject: [PATCH 09/25] chore: fix makefile to run the tests

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 46a4a28d5..22a69c758 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ test:
 	ydata_profiling -h
 
 test_spark:
-	pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/
+	pytest tests/backends/spark_backend/
 	ydata_profiling -h
 
 test_cov:
@@ -36,7 +36,7 @@ install-docs: install ### Installs regular and docs dependencies
 install-spark-ci:
 	sudo apt-get update
-	sudo apt-get -y install openjdk-8-jdk
+	sudo apt-get -y install openjdk-11-jdk
 	curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
 		--output ${SPARK_DIRECTORY}/spark.tgz
 	cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark

From a881b01ef6487e0d51f98e48f9dd44f480cde49c Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 17:09:19 -0700
Subject: [PATCH 10/25] chore: tests for pyspark versions bigger than 3.4

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 08136a8f5..17920d72b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -127,7 +127,7 @@ jobs:
     strategy:
       matrix:
         python-version: [ "3.9", "3.10", "3.11", "3.12" ]
-        pyspark-version: [ "3.4.4", "3.5.0" ]
+        pyspark-version: [ "3.5.0" ]
 
     name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}

From 7d702f5499288aa29725e27aacd882e69af1f2bc Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 21:51:45 -0700
Subject: [PATCH 11/25] fix: add other pyspark versions to the tests

---
 .github/workflows/tests.yml                  |  2 +-
 .../spark_backend/test_descriptions_spark.py | 21 +++++++++++--------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 17920d72b..ce45742f6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -127,7 +127,7 @@ jobs:
     strategy:
       matrix:
         python-version: [ "3.9", "3.10", "3.11", "3.12" ]
-        pyspark-version: [ "3.5.0" ]
+        pyspark-version: [ "3.4" , "3.5" ]
 
     name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}

diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py
index c11330017..20a6ac18d 100644
--- a/tests/backends/spark_backend/test_descriptions_spark.py
+++ b/tests/backends/spark_backend/test_descriptions_spark.py
@@ -7,6 +7,7 @@
 
 from ydata_profiling.config import SparkSettings
 from ydata_profiling.model.describe import describe
+from pyspark.sql.types import TimestampType
 
 check_is_NaN = "ydata_profiling.check_is_NaN"
@@ -41,15 +42,15 @@ def describe_data():
         "s1": np.ones(9),
         "s2": ["some constant text $ % value {obj} " for _ in range(1, 10)],
         "somedate": [
-            datetime.datetime(2011, 7, 4),
-            datetime.datetime(2022, 1, 1, 13, 57),
-            datetime.datetime(1990, 12, 9),
+            datetime.date(2011, 7, 4),
+            datetime.date(2011, 7, 2),
+            datetime.date(1990, 12, 9),
             pd.NaT,
-            datetime.datetime(1990, 12, 9),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1972, 1, 2),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1970, 12, 9),
+            datetime.date(1990, 12, 9),
+            datetime.date(1970, 12, 9),
+            datetime.date(1972, 1, 2),
+            datetime.date(1970, 12, 9),
+            datetime.date(1970, 12, 9),
         ],
         "bool_tf": [True, True, False, True, False, True, True, False, True],
         "bool_tf_with_nan": [
@@ -370,13 +371,15 @@ def test_describe_spark_df(
 
     if column == "mixed":
         describe_data[column] = [str(i) for i in describe_data[column]]
-    if column == "bool_tf_with_nan":
+    elif column == "bool_tf_with_nan":
         describe_data[column] = [
             True if i else False for i in describe_data[column]  # noqa: SIM210
         ]
+    pdf = pd.DataFrame({column: describe_data[column]})  # Convert to Pandas DataFrame
     # Ensure NaNs are replaced with None (Spark does not support NaN in non-float columns)
     pdf = pdf.where(pd.notna(pdf), None)
+
     sdf = spark_session.createDataFrame(pdf)
 
     results = describe(cfg, sdf, summarizer_spark, typeset)
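The datetime.date switch and the NaN-to-None scrub in patch 11 both come down to how Spark infers column types from pandas input. A minimal sketch of the conversion path the test now exercises (assuming a local SparkSession; the three-row column is illustrative):

    import datetime

    import pandas as pd
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # An object column mixing dates with a missing value (pd.NaT).
    pdf = pd.DataFrame(
        {"somedate": [datetime.date(2011, 7, 4), pd.NaT, datetime.date(1990, 12, 9)]}
    )

    # Spark does not accept NaN/NaT in non-float columns, so missing values
    # are normalized to None before the frame is handed over.
    pdf = pdf.where(pd.notna(pdf), None)

    sdf = spark.createDataFrame(pdf)
    sdf.printSchema()  # somedate: date (nullable = true)
    spark.stop()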
From f8c06e8fb3e6aafddafa49777d701fa72822b44b Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 17 Mar 2025 22:01:07 -0700
Subject: [PATCH 12/25] chore: use ubuntu-22.04

---
 .github/workflows/pull-request.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 16c628479..e95859fa8 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -9,7 +9,7 @@ on:
 jobs:
   commitlint:
     name: Lint commit message
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -21,7 +21,7 @@ jobs:
   lint:
     if: github.actor != 'renovate[bot]'
     name: Lint source code
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -85,7 +85,7 @@ jobs:
 
   validate-docs:
     name: Validate Docs
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
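Taken together, the workflow that emerges from these commits runs PySpark installed from pip against a system OpenJDK 11, with no separately downloaded Spark distribution. A hedged smoke test of that combination (the JAVA_HOME path is the one the workflow assumes on ubuntu-22.04; none of this script is part of the repository):

    import os
    import subprocess

    from pyspark.sql import SparkSession

    # Values the updated workflow exports via $GITHUB_ENV (assumed paths).
    os.environ.setdefault("JAVA_HOME", "/usr/lib/jvm/java-11-openjdk-amd64")
    os.environ.setdefault("SPARK_LOCAL_IP", "127.0.0.1")

    subprocess.run(["java", "-version"], check=True)  # expect an OpenJDK 11 build

    spark = SparkSession.builder.master("local[1]").appName("ci-smoke").getOrCreate()
    print(spark.version)  # should match the pinned pyspark, e.g. 3.4.x or 3.5.x
    spark.stop()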
"s2": ["some constant text $ % value {obj} " for _ in range(1, 10)], "somedate": [ - datetime.datetime(2011, 7, 4), - datetime.datetime(2022, 1, 1, 13, 57), - datetime.datetime(1990, 12, 9), + datetime.date(2011, 7, 4), + datetime.date(2011, 7, 2), + datetime.date(1990, 12, 9), pd.NaT, - datetime.datetime(1990, 12, 9), - datetime.datetime(1970, 12, 9), - datetime.datetime(1972, 1, 2), - datetime.datetime(1970, 12, 9), - datetime.datetime(1970, 12, 9), + datetime.date(1990, 12, 9), + datetime.date(1970, 12, 9), + datetime.date(1972, 1, 2), + datetime.date(1970, 12, 9), + datetime.date(1970, 12, 9), ], "bool_tf": [True, True, False, True, False, True, True, False, True], "bool_tf_with_nan": [ @@ -370,13 +371,15 @@ def test_describe_spark_df( if column == "mixed": describe_data[column] = [str(i) for i in describe_data[column]] - if column == "bool_tf_with_nan": + elif column == "bool_tf_with_nan": describe_data[column] = [ True if i else False for i in describe_data[column] # noqa: SIM210 ] + pdf= pd.DataFrame({column: describe_data[column]})# Convert to Pandas DataFrame # Ensure NaNs are replaced with None (Spark does not support NaN in non-float columns) pdf = pdf.where(pd.notna(pdf), None) + sdf = spark_session.createDataFrame(pdf) results = describe(cfg, sdf, summarizer_spark, typeset) From 48cb72f53043082f0cc7c731fc60cc61ceae7700 Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Mon, 17 Mar 2025 22:01:07 -0700 Subject: [PATCH 24/25] chore: use ubuntu-22.04 --- .github/workflows/pull-request.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index d51d14932..d93940070 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -9,7 +9,7 @@ on: jobs: commitlint: name: Lint commit message - runs-on: ubuntu-24.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 @@ -21,7 +21,7 @@ jobs: lint: if: github.actor != 'renovate[bot]' name: Lint source code - runs-on: ubuntu-24.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 @@ -85,7 +85,7 @@ jobs: validate-docs: name: Validate Docs - runs-on: ubuntu-24.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 From dfa8987610b80727f8c1823473e6152b01c8ec28 Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Tue, 18 Mar 2025 11:06:15 -0700 Subject: [PATCH 25/25] chore: remove unused timestamp --- tests/backends/spark_backend/test_descriptions_spark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py index cf684f369..24de9afbb 100644 --- a/tests/backends/spark_backend/test_descriptions_spark.py +++ b/tests/backends/spark_backend/test_descriptions_spark.py @@ -7,7 +7,6 @@ from ydata_profiling.config import SparkSettings from ydata_profiling.model.describe import describe -from pyspark.sql.types import TimestampType check_is_NaN = "ydata_profiling.check_is_NaN"