Merged
Changes from all commits
28 commits
652c090
chore: update test pipeline to run higher spark version
fabclmnt Mar 17, 2025
5e96926
chore: update spark test CI/CD pipeline.
fabclmnt Mar 17, 2025
cec7199
chore: fix spark CI/CD
fabclmnt Mar 17, 2025
10e1cb9
chore: remove make spark-ci
fabclmnt Mar 17, 2025
ebdb2d4
chore: add pytest to the dependencies
fabclmnt Mar 17, 2025
abb17e4
fix: fixing numba version due to visions
fabclmnt Mar 17, 2025
0f085dc
feat: update pyspark install
fabclmnt Mar 17, 2025
443e442
fix: pyproject
fabclmnt Mar 17, 2025
d55d4a6
chore: fix makefile to run the tests
fabclmnt Mar 17, 2025
a881b01
chore: tests for pyspark versions bigger than 3.4
fabclmnt Mar 18, 2025
7d702f5
fix: add other pyspark versions to the tests
fabclmnt Mar 18, 2025
f8c06e8
chore: use ubuntu-22.04
fabclmnt Mar 18, 2025
7bd1069
chore: update test pipeline to run higher spark version
fabclmnt Mar 17, 2025
adde347
chore: update spark test CI/CD pipeline.
fabclmnt Mar 17, 2025
5159197
chore: fix spark CI/CD
fabclmnt Mar 17, 2025
1a01cf2
chore: remove make spark-ci
fabclmnt Mar 17, 2025
ad03b84
chore: add pytest to the dependencies
fabclmnt Mar 17, 2025
af1a17f
fix: fixing numba version due to visions
fabclmnt Mar 17, 2025
f8b0702
feat: update pyspark install
fabclmnt Mar 17, 2025
439fdbc
fix: pyproject
fabclmnt Mar 17, 2025
b5916ae
chore: fix makefile to run the tests
fabclmnt Mar 17, 2025
53383c4
chore: tests for pyspark versions bigger than 3.4
fabclmnt Mar 18, 2025
243bf17
fix: add other pyspark versions to the tests
fabclmnt Mar 18, 2025
48cb72f
chore: use ubuntu-22.04
fabclmnt Mar 18, 2025
3cf7528
Merge branch 'feat/updt_vspark_support' of https://github.com/ydataai…
fabclmnt Mar 18, 2025
c594ca4
Merge branch 'develop' into feat/updt_vspark_support
fabclmnt Mar 18, 2025
3b735b5
Merge branch 'feat/updt_vspark_support' of https://github.com/ydataai…
fabclmnt Mar 18, 2025
dfa8987
chore: remove unused timestamp
fabclmnt Mar 18, 2025
6 changes: 3 additions & 3 deletions .github/workflows/pull-request.yml
@@ -9,7 +9,7 @@ on:
jobs:
commitlint:
name: Lint commit message
runs-on: ubuntu-24.04
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v4
@@ -21,7 +21,7 @@ jobs:
lint:
if: github.actor != 'renovate[bot]'
name: Lint source code
runs-on: ubuntu-24.04
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v4
@@ -85,7 +85,7 @@ jobs:

validate-docs:
name: Validate Docs
runs-on: ubuntu-24.04
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v4
85 changes: 36 additions & 49 deletions .github/workflows/tests.yml
@@ -122,64 +122,51 @@ jobs:
- run: codecov -F py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.pandas }}-${{ matrix.numpy }}

test_spark:
runs-on: ${{ matrix.os }}
continue-on-error: True
runs-on: ubuntu-22.04
continue-on-error: false
strategy:
matrix:
os: [ ubuntu-22.04 ]
python-version: [3.8]
pandas: ["pandas>1.1"]
spark: ["3.0.1"]
hadoop: [ 2.7 ]
numpy: ["numpy"]
java_home: [ /usr/lib/jvm/java-8-openjdk-amd64 ]

name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
env:
JAVA_HOME: ${{ matrix.java_home }}
SPARK_VERSION: ${{ matrix.spark }}
HADOOP_VERSION: ${{ matrix.hadoop }}
SPARK_DIRECTORY: ${{ github.workspace }}/../
SPARK_HOME: ${{ github.workspace }}/../spark/
YDATA_PROFILING_NO_ANALYTICS: ${{ matrix.analytics }}
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
pyspark-version: [ "3.4" , "3.5" ]

name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}

steps:
- uses: actions/checkout@v4
- name: Setup python
- name: Checkout Code
uses: actions/checkout@v4

- name: Install Java (OpenJDK 11)
run: |
sudo apt-get update
sudo apt-get install -y openjdk-11-jdk
echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $GITHUB_ENV
echo "PATH=$JAVA_HOME/bin:$PATH" >> $GITHUB_ENV
java -version

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
architecture: x64
- uses: actions/cache@v4
if: startsWith(runner.os, 'Linux')

- name: Cache pip dependencies
uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
key: pip-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pyspark-version }}-${{ hashFiles('requirements/*.txt', 'setup.cfg', 'pyproject.toml') }}
restore-keys: |
${{ runner.os }}-${{ matrix.pandas }}-pip-\
- uses: actions/cache@v4
if: startsWith(runner.os, 'macOS')
with:
path: ~/Library/Caches/pip
key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-${{ matrix.pandas }}-pip-
- uses: actions/cache@v4
if: startsWith(runner.os, 'Windows')
with:
path: ~\AppData\Local\pip\Cache
key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-${{ matrix.pandas }}-pip-
- run: |
pip install --upgrade pip setuptools wheel
pip install pytest-spark>=0.6.0 pyarrow==1.0.1 pyspark=="${{ matrix.spark }}"
pip-${{ runner.os }}-

- name: Install Dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install pyarrow>4.0.0 pyspark=="${{ matrix.pyspark-version }}" --no-cache-dir
echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV

- name: Run Tests
run: |
make install
pip install ".[test]"
pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
- if: ${{ matrix.spark != '3.0.1' }}
run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
- run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
- run: make install
- run: make install-spark-ci
- run: pip install ".[spark]" # Make sure the proper version of pandas is install after everything
- run: make test_spark
make test_spark
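The refactored job drops the downloaded Spark distribution and the per-OS cache steps in favor of a pip-installed PySpark on a single Ubuntu runner. For local reproduction, a minimal smoke test of that setup might look like the following sketch (hypothetical, not part of this PR; the app name and assertion are illustrative, and it assumes pyspark, pyarrow, and pandas are installed):

import os

import pandas as pd
from pyspark.sql import SparkSession

# Same loopback binding the workflow exports via GITHUB_ENV.
os.environ["SPARK_LOCAL_IP"] = "127.0.0.1"

spark = (
    SparkSession.builder.master("local[2]")
    .appName("spark-ci-smoke-test")  # illustrative name
    # Arrow-backed pandas conversion is why the job installs pyarrow>4.0.0.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Round-trip a DataFrame through Spark to confirm the install works end to end.
sdf = spark.createDataFrame(pd.DataFrame({"x": [1, 2, 3]}))
assert sdf.toPandas()["x"].sum() == 6
spark.stop()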

4 changes: 2 additions & 2 deletions Makefile
@@ -10,7 +10,7 @@ test:
ydata_profiling -h

test_spark:
pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/
pytest tests/backends/spark_backend/
ydata_profiling -h

test_cov:
@@ -36,7 +36,7 @@ install-docs: install ### Installs regular and docs dependencies

install-spark-ci:
sudo apt-get update
sudo apt-get -y install openjdk-8-jdk
sudo apt-get -y install openjdk-11-jdk
curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
--output ${SPARK_DIRECTORY}/spark.tgz
cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
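With the --spark_home flag gone from test_spark, pytest no longer needs to be pointed at an unpacked Spark distribution; the suite can build its session from the pip-installed pyspark package. A sketch of the kind of conftest.py fixture that supports this (an assumption, not code from this PR; the fixture name matches the spark_session argument used in the test diff below, and the builder options are illustrative):

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # Built from the pip-installed pyspark package; no SPARK_HOME needed.
    spark = (
        SparkSession.builder.master("local[*]")
        .appName("spark-backend-tests")  # illustrative name
        .getOrCreate()
    )
    yield spark
    spark.stop()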
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -67,7 +67,7 @@ dependencies = [
"imagehash==4.3.1",
"wordcloud>=1.9.3",
"dacite>=1.8",
"numba>=0.56.0, <1",
"numba>=0.56.0, <=0.61",
]

dynamic = [
@@ -108,10 +108,10 @@ notebook = [
# note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
# set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
spark = [
"pyspark>=2.3.0",
"pyarrow>=2.0.0",
"pandas>1.1, <2, !=1.4.0",
"numpy>=1.16.0,<1.24",
"pyspark>=3.0",
"pyarrow>=4.0.0",
"pandas>1.1",
"numpy>=1.16.0",
"visions[type_image_path]>=0.7.5, <0.7.7",
]

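Because the spark extra now floats on pandas and numpy while capping numba at 0.61 (the visions workaround noted in the commit log), a quick way to sanity-check an existing environment is a version probe along these lines (a sketch; the thresholds are copied from the diff above, and it assumes the packaging library is available):

from importlib.metadata import version

from packaging.version import Version

# Thresholds mirror the updated spark extra in pyproject.toml.
assert Version(version("pyspark")) >= Version("3.0")
assert Version(version("pyarrow")) >= Version("4.0.0")
# numba is capped because newer releases conflict with visions.
assert Version(version("numba")) <= Version("0.61")
print("environment satisfies the updated spark extra")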
19 changes: 10 additions & 9 deletions tests/backends/spark_backend/test_descriptions_spark.py
@@ -41,15 +41,15 @@ def describe_data():
"s1": np.ones(9),
"s2": ["some constant text $ % value {obj} " for _ in range(1, 10)],
"somedate": [
datetime.datetime(2011, 7, 4),
datetime.datetime(2022, 1, 1, 13, 57),
datetime.datetime(1990, 12, 9),
datetime.date(2011, 7, 4),
datetime.date(2011, 7, 2),
datetime.date(1990, 12, 9),
pd.NaT,
datetime.datetime(1990, 12, 9),
datetime.datetime(1970, 12, 9),
datetime.datetime(1972, 1, 2),
datetime.datetime(1970, 12, 9),
datetime.datetime(1970, 12, 9),
datetime.date(1990, 12, 9),
datetime.date(1970, 12, 9),
datetime.date(1972, 1, 2),
datetime.date(1970, 12, 9),
datetime.date(1970, 12, 9),
],
"bool_tf": [True, True, False, True, False, True, True, False, True],
"bool_tf_with_nan": [
@@ -370,13 +370,14 @@ def test_describe_spark_df(

if column == "mixed":
describe_data[column] = [str(i) for i in describe_data[column]]
if column == "bool_tf_with_nan":
elif column == "bool_tf_with_nan":
describe_data[column] = [
True if i else False for i in describe_data[column] # noqa: SIM210
]
pdf = pd.DataFrame({column: describe_data[column]}) # Convert to Pandas DataFrame
# Ensure NaNs are replaced with None (Spark does not support NaN in non-float columns)
pdf = pdf.where(pd.notna(pdf), None)

sdf = spark_session.createDataFrame(pdf)

results = describe(cfg, sdf, summarizer_spark, typeset)
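The pd.notna(...) -> None conversion added above is the load-bearing change: per the inline comment, Spark does not support NaN (or NaT) in non-float columns, so missing values must arrive as None. A standalone illustration of the same transformation (hypothetical; only the pandas side is shown, with the Spark call left commented):

import datetime

import pandas as pd

pdf = pd.DataFrame({"somedate": [datetime.date(2011, 7, 4), pd.NaT]})

# Replace NaT/NaN with None so Spark can infer DateType with proper nulls.
pdf = pdf.where(pd.notna(pdf), None)

# sdf = spark_session.createDataFrame(pdf)  # would now see a null, not NaT
print(pdf)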