Skip to content
Open

hh #2

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
1 change: 0 additions & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,3 @@ COPY Makefile /tmp/
RUN su $USER -c "/usr/bin/python3 -m venv /home/${USER}/venv" \
&& su $USER -c "${VENV_PATH}/bin/pip --disable-pip-version-check --no-cache-dir install -r /tmp/requirements.txt" \
&& rm -rf /tmp/requirements.txt

25 changes: 7 additions & 18 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,17 @@
// https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/codespaces-linux
{
"name": "GitHub Codespaces (Default)",

"build": {
"dockerfile": "Dockerfile",
"context": ".."
},
"features": {
"ghcr.io/devcontainers/features/nvidia-cuda:1": {
"installCudnn": true
}
},

"features": {},
// Configure tool-specific properties.
"customizations": {
// Configure properties specific to VS Code.
"vscode": {
// Set *default* container specific settings.json values on container create.
"settings": {
"settings": {
"go.toolsManagement.checkForUpdates": "local",
"go.useLanguageServer": true,
"go.gopath": "/go",
Expand All @@ -39,7 +33,6 @@
"**/target/**": true
}
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"GitHub.vscode-pull-request-github",
Expand All @@ -55,25 +48,21 @@
]
}
},

"remoteUser": "codespace",

"overrideCommand": false,

"mounts": ["source=codespaces-linux-var-lib-docker,target=/var/lib/docker,type=volume"],

"mounts": [
"source=codespaces-linux-var-lib-docker,target=/var/lib/docker,type=volume"
],
"runArgs": [
"--cap-add=SYS_PTRACE",
"--security-opt",
"seccomp=unconfined",
"--privileged",
"--init"
],

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// "oryx build" will automatically install your dependencies and attempt to build your project
//"postCreateCommand": "oryx build -p virtualenv_name=.venv --log-file /tmp/oryx-build.log --manifest-dir /tmp || echo 'Could not auto-build. Skipping.'"
"postCreateCommand": "bash setup.sh"
}
"postCreateCommand": "bash setup.sh"
}
2 changes: 2 additions & 0 deletions .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,7 @@ jobs:
run: make test
- name: format
run: make format
- name: generate_and_push
run: make generate_and_push
- name: deploy
run: make deploy
82 changes: 30 additions & 52 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,20 +1,14 @@
#ignore huggingface
summarizeApp
#ignore fine-tuning
test_trainer/
# Ignore data folder except US_birth.csv
data/*
!data/US_birth.csv

#ignore pytorch artifacts
data
model.pth
# Ignore other temporary or unnecessary files

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
Expand All @@ -37,8 +31,6 @@ share/python-wheels/
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

Expand All @@ -60,56 +52,17 @@ coverage.xml
.hypothesis/
.pytest_cache/

# Translations
# Translation files
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
Expand All @@ -119,6 +72,19 @@ ENV/
env.bak/
venv.bak/

# Flask instance folder
instance/
.webassets-cache

# Scrapy
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Spyder project settings
.spyderproject
.spyproject
Expand All @@ -136,3 +102,15 @@ dmypy.json

# Pyre type checker
.pyre/

# pyenv
.python-version

# pipenv
#Pipfile.lock

# PEP 582
__pypackages__/

# Docker
**/Dockerfile
32 changes: 22 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,25 +1,37 @@
install:
pip install --upgrade pip &&\
pip install --upgrade pip &&\
pip install -r requirements.txt

test:
python -m pytest -vv --cov=main --cov=mylib test_*.py
python -m pytest -vv --cov=mylib test_*.py

format:
black *.py
format:
black *.py

lint:
#disable comment to test speed
#pylint --disable=R,C --ignore-patterns=test_.*?py *.py mylib/*.py
#ruff linting is 10-100X faster than pylint
ruff check *.py mylib/*.py
# Uncomment below to test with pylint
# pylint --disable=R,C --ignore-patterns=test_.*?py *.py mylib/*.py
# ruff linting is 10-100X faster than pylint
ruff check *.py mylib/*.py

container-lint:
docker run --rm -i hadolint/hadolint < Dockerfile
docker run --rm -i hadolint/hadolint < Dockerfile

refactor: format lint

deploy:
#deploy goes here
# Deploy commands go here

all: install lint test format deploy

generate_and_push:
# Add, commit, and push the generated files to GitHub
@if [ -n "$$(git status --porcelain)" ]; then \
git config --local user.email "action@github.com"; \
git config --local user.name "GitHub Action"; \
git add .; \
git commit -m "Add output log"; \
git push; \
else \
echo "No changes to commit. Skipping commit and push."; \
fi
69 changes: 34 additions & 35 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,37 +1,36 @@
[![CI](https://github.com/nogibjj/python-ruff-template/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/python-ruff-template/actions/workflows/cicd.yml)
## Template for Python projects with RUFF linter

![1 15_rust_built_python_tools](https://github.com/nogibjj/python-ruff-template/assets/58792/db5f7bda-a977-4c67-acbe-a70fe034fbdf)



1. First thing to do on launch is to open a new shell and verify virtualenv is sourced.

Things included are:

* `Makefile`

* `Pytest`

* `pandas`

* `Ruff`:

Run `make lint` which runs `ruff check`. You can find out more info on [Ruff here](https://github.com/astral-sh/ruff).

* `Dockerfile`

* `GitHub copilot`

* `jupyter` and `ipython`

* A base set of libraries for devops and web

* `githubactions`

[![CI](https://github.com/jayliu1016/ids_de_mini10/actions/workflows/cicd.yml/badge.svg)](https://github.com/jayliu1016/ids_de_mini10/actions/workflows/cicd.yml)

# PySpark Data Processing Project
## Project Overview
This project uses PySpark to process the `US_birth.csv` dataset. It runs Spark SQL queries, applies data transformations, and computes descriptive statistics: the dataset is extracted, transformed, and queried, and the results are written to a formatted summary report.

## Dataset
We use the US_birth.csv dataset, which contains data on U.S. births. This dataset will be processed in a PySpark environment to demonstrate Spark's powerful data processing capabilities on large-scale data.

## Getting Started
### Environment Setup
1. Open Codespaces or your preferred IDE.
2. Wait for the environment setup to complete.

## Output
- **Spark Output Data**: Generated from PySpark transformations.
- **Summary Markdown File**: A summary file containing key statistics and insights from the dataset.

## Code Formatting and Linting
This project follows code quality standards for readability and maintenance: run `make format` to format the code with `black`, and `make lint` to lint it with `ruff`.


## Process Overview
1. **Data Extraction**: Extracts the dataset using the `extract` function.
2. **Spark Session**: Initializes a Spark session via `start_spark`.
3. **Data Loading**: Loads the dataset into a Spark DataFrame with `load_data`.
4. **Descriptive Statistics**: Generates summary statistics using `describe`.
5. **Query Execution**: Executes a SQL query on the dataset via `query`.
6. **Data Transformation**: Performs additional transformations with `example_transform`.
7. **End Spark Session**: Closes the Spark session with `end_spark`.

## References

![1 1-function-essence-of-programming](https://github.com/nogibjj/python-ruff-template/assets/58792/f7f33cd3-cff5-4014-98ea-09b6a29c7557)


- Python Ruff Template (project starting point): https://github.com/nogibjj/python-ruff-template
- Original Dataset Source: https://github.com/fivethirtyeight/data/tree/master/births
## GitHub Actions
This project includes GitHub Actions for CI/CD, automatically running tests and code formatting checks on each push.
echo "Trigger CI/CD workflow" >> README.md

Loading