diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index c66e0317c..11358bd3f 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -9,7 +9,7 @@ on:
 jobs:
   commitlint:
     name: Lint commit message
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -21,7 +21,7 @@ jobs:
   lint:
     if: github.actor != 'renovate[bot]'
     name: Lint source code
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -85,7 +85,7 @@ jobs:
 
   validate-docs:
     name: Validate Docs
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4f7b054a0..ce45742f6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -122,64 +122,51 @@ jobs:
       - run: codecov -F py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.pandas }}-${{ matrix.numpy }}
 
   test_spark:
-    runs-on: ${{ matrix.os }}
-    continue-on-error: True
+    runs-on: ubuntu-22.04
+    continue-on-error: false
     strategy:
       matrix:
-        os: [ ubuntu-22.04 ]
-        python-version: [3.8]
-        pandas: ["pandas>1.1"]
-        spark: ["3.0.1"]
-        hadoop: [ 2.7 ]
-        numpy: ["numpy"]
-        java_home: [ /usr/lib/jvm/java-8-openjdk-amd64 ]
-
-    name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
-    env:
-      JAVA_HOME: ${{ matrix.java_home }}
-      SPARK_VERSION: ${{ matrix.spark }}
-      HADOOP_VERSION: ${{ matrix.hadoop }}
-      SPARK_DIRECTORY: ${{ github.workspace }}/../
-      SPARK_HOME: ${{ github.workspace }}/../spark/
-      YDATA_PROFILING_NO_ANALYTICS: ${{ matrix.analytics }}
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        pyspark-version: [ "3.4", "3.5" ]
+
+    name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}
+
     steps:
-      - uses: actions/checkout@v4
-      - name: Setup python
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Install Java (OpenJDK 11)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openjdk-11-jdk
+          echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/jvm/java-11-openjdk-amd64/bin:$PATH" >> $GITHUB_ENV
+          java -version
+
+      - name: Setup Python
         uses: actions/setup-python@v5
         with:
          python-version: ${{ matrix.python-version }}
           architecture: x64
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Linux')
+
+      - name: Cache pip dependencies
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+          key: pip-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pyspark-version }}-${{ hashFiles('requirements/*.txt', 'setup.cfg', 'pyproject.toml') }}
           restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'macOS')
-        with:
-          path: ~/Library/Caches/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Windows')
-        with:
-          path: ~\AppData\Local\pip\Cache
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - run: |
-          pip install --upgrade pip setuptools wheel
-          pip install pytest-spark>=0.6.0 pyarrow==1.0.1 pyspark=="${{ matrix.spark }}"
+            pip-${{ runner.os }}-
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          pip install "pyarrow>4.0.0" pyspark=="${{ matrix.pyspark-version }}" --no-cache-dir
+          echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
+          echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
+
+      - name: Run Tests
+        run: |
+          make install
           pip install ".[test]"
-          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
-      - if: ${{ matrix.spark != '3.0.1' }}
-        run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
-      - run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
-      - run: make install
-      - run: make install-spark-ci
-      - run: pip install ".[spark]" # Make sure the proper version of pandas is install after everything
-      - run: make test_spark
+          make test_spark
diff --git a/Makefile b/Makefile
index 46a4a28d5..22a69c758 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ test:
 	ydata_profiling -h
 
 test_spark:
-	pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/
+	pytest tests/backends/spark_backend/
 	ydata_profiling -h
 
 test_cov:
@@ -36,7 +36,7 @@ install-docs: install ### Installs regular and docs dependencies
 
 install-spark-ci:
 	sudo apt-get update
-	sudo apt-get -y install openjdk-8-jdk
+	sudo apt-get -y install openjdk-11-jdk
 	curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
 		--output ${SPARK_DIRECTORY}/spark.tgz
 	cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
diff --git a/pyproject.toml b/pyproject.toml
index 50e75eb96..e1e668788 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,7 +67,7 @@ dependencies = [
     "imagehash==4.3.1",
     "wordcloud>=1.9.3",
     "dacite>=1.8",
-    "numba>=0.56.0, <1",
+    "numba>=0.56.0, <=0.61",
 ]
 
 dynamic = [
@@ -108,10 +108,10 @@ notebook = [
 # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
 # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
 spark = [
-    "pyspark>=2.3.0",
-    "pyarrow>=2.0.0",
-    "pandas>1.1, <2, !=1.4.0",
-    "numpy>=1.16.0,<1.24",
+    "pyspark>=3.0",
+    "pyarrow>=4.0.0",
+    "pandas>1.1",
+    "numpy>=1.16.0",
     "visions[type_image_path]>=0.7.5, <0.7.7",
 ]
diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py
index 5c608f2f0..24de9afbb 100644
--- a/tests/backends/spark_backend/test_descriptions_spark.py
+++ b/tests/backends/spark_backend/test_descriptions_spark.py
@@ -41,15 +41,15 @@ def describe_data():
         "s1": np.ones(9),
         "s2": ["some constant text $ % value {obj} " for _ in range(1, 10)],
         "somedate": [
-            datetime.datetime(2011, 7, 4),
-            datetime.datetime(2022, 1, 1, 13, 57),
-            datetime.datetime(1990, 12, 9),
+            datetime.date(2011, 7, 4),
+            datetime.date(2011, 7, 2),
+            datetime.date(1990, 12, 9),
             pd.NaT,
-            datetime.datetime(1990, 12, 9),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1972, 1, 2),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1970, 12, 9),
+            datetime.date(1990, 12, 9),
+            datetime.date(1970, 12, 9),
+            datetime.date(1972, 1, 2),
+            datetime.date(1970, 12, 9),
+            datetime.date(1970, 12, 9),
         ],
         "bool_tf": [True, True, False, True, False, True, True, False, True],
         "bool_tf_with_nan": [
@@ -370,13 +370,14 @@ def test_describe_spark_df(
     if column == "mixed":
         describe_data[column] = [str(i) for i in describe_data[column]]
-    if column == "bool_tf_with_nan":
+    elif column == "bool_tf_with_nan":
         describe_data[column] = [
             True if i else False for i in describe_data[column]  # noqa: SIM210
         ]
 
     pdf = pd.DataFrame({column: describe_data[column]})  # Convert to Pandas DataFrame
     # Ensure NaNs are replaced with None (Spark does not support NaN in non-float columns)
     pdf = pdf.where(pd.notna(pdf), None)
+
     sdf = spark_session.createDataFrame(pdf)
     results = describe(cfg, sdf, summarizer_spark, typeset)
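
Illustration (not part of the patch): the test change above replaces pandas NaN/NaT with None before handing data to Spark, because Spark rejects NaN/NaT in non-float columns. Below is a minimal standalone sketch of that pattern, assuming a local PySpark >= 3.4 install with Java 11 (matching the new CI matrix); the names `spark`, `pdf`, and `sdf` are illustrative only.

    # Minimal sketch: convert NaN/NaT to None before createDataFrame.
    import datetime

    import pandas as pd
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").appName("nan-to-none").getOrCreate()

    pdf = pd.DataFrame(
        {
            "somedate": [
                datetime.date(2011, 7, 4),
                pd.NaT,  # missing value in an object-typed date column
                datetime.date(1990, 12, 9),
            ]
        }
    )

    # Spark does not accept NaN/NaT in non-float columns; replace with None first.
    pdf = pdf.where(pd.notna(pdf), None)

    sdf = spark.createDataFrame(pdf)  # column is inferred as DateType
    sdf.show()
    spark.stop()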