nogibjj · jayliu1016 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/.DS_Store b/.DS_Store
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -16,4 +16,3 @@ COPY Makefile /tmp/
 RUN su $USER -c "/usr/bin/python3 -m venv /home/${USER}/venv" \
    && su $USER -c "${VENV_PATH}/bin/pip --disable-pip-version-check --no-cache-dir install -r /tmp/requirements.txt" \
    && rm -rf /tmp/requirements.txt 
-
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -2,23 +2,17 @@
 // https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/codespaces-linux
 {
 	"name": "GitHub Codespaces (Default)",
-
 	"build": {
 		"dockerfile": "Dockerfile",
 		"context": ".."
 	},
-	"features": {
-		"ghcr.io/devcontainers/features/nvidia-cuda:1": { 
-		  "installCudnn": true
-		}
-	  },
-
+	"features": {},
 	// Configure tool-specific properties.
 	"customizations": {
 		// Configure properties specific to VS Code.
 		"vscode": {
 			// Set *default* container specific settings.json values on container create.
-			"settings": { 
+			"settings": {
 				"go.toolsManagement.checkForUpdates": "local",
 				"go.useLanguageServer": true,
 				"go.gopath": "/go",
@@ -39,7 +33,6 @@
 					"**/target/**": true
 				}
 			},
-
 			// Add the IDs of extensions you want installed when the container is created.
 			"extensions": [
 				"GitHub.vscode-pull-request-github",
@@ -55,25 +48,21 @@
 			]
 		}
 	},
-
 	"remoteUser": "codespace",
-
 	"overrideCommand": false,
-
-	"mounts": ["source=codespaces-linux-var-lib-docker,target=/var/lib/docker,type=volume"],
-
+	"mounts": [
+		"source=codespaces-linux-var-lib-docker,target=/var/lib/docker,type=volume"
+	],
 	"runArgs": [
 		"--cap-add=SYS_PTRACE",
 		"--security-opt",
 		"seccomp=unconfined",
 		"--privileged",
 		"--init"
 	],
-
 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
 	// "forwardPorts": [],
-
 	// "oryx build" will automatically install your dependencies and attempt to build your project
 	//"postCreateCommand": "oryx build -p virtualenv_name=.venv --log-file /tmp/oryx-build.log --manifest-dir /tmp || echo 'Could not auto-build. Skipping.'"
-	 "postCreateCommand": "bash setup.sh"
-}
+	"postCreateCommand": "bash setup.sh"
+}
diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
@@ -19,5 +19,7 @@ jobs:
         run: make test
       - name: format
         run: make format
+      - name: generate_and_push
+        run: make generate_and_push
       - name: deploy
         run: make deploy
diff --git a/.gitignore b/.gitignore
@@ -1,20 +1,14 @@
-#ignore huggingface
-summarizeApp
-#ignore fine-tuning
-test_trainer/
+# Ignore data folder except US_birth.csv
+data/*
+!data/US_birth.csv
 
-#ignore pytorch artifacts
-data
-model.pth
+# Ignore other temporary or unnecessary files
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 
-# C extensions
-*.so
-
 # Distribution / packaging
 .Python
 build/
@@ -37,8 +31,6 @@ share/python-wheels/
 MANIFEST
 
 # PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 
@@ -60,56 +52,17 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 
-# Translations
+# Translation files
 *.mo
 *.pot
 
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
 # Jupyter Notebook
 .ipynb_checkpoints
 
 # IPython
 profile_default/
 ipython_config.py
 
-# pyenv
-.python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
 # Environments
 .env
 .venv
@@ -119,6 +72,19 @@ ENV/
 env.bak/
 venv.bak/
 
+# Flask instance folder
+instance/
+.webassets-cache
+
+# Scrapy
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
 # Spyder project settings
 .spyderproject
 .spyproject
@@ -136,3 +102,15 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# pyenv
+.python-version
+
+# pipenv
+#Pipfile.lock
+
+# PEP 582
+__pypackages__/
+
+# Docker
+**/Dockerfile
diff --git a/Makefile b/Makefile
@@ -1,25 +1,37 @@
 install:
-	pip install --upgrade pip &&\
+		pip install --upgrade pip &&\
 		pip install -r requirements.txt
 
 test:
-	python -m pytest -vv --cov=main --cov=mylib test_*.py
+		python -m pytest -vv --cov=mylib test_*.py
 
-format:	
-	black *.py 
+format:
+		black *.py 
 
 lint:
-	#disable comment to test speed
-	#pylint --disable=R,C --ignore-patterns=test_.*?py *.py mylib/*.py
-	#ruff linting is 10-100X faster than pylint
-	ruff check *.py mylib/*.py
+		# Uncomment below to test with pylint
+		# pylint --disable=R,C --ignore-patterns=test_.*?py *.py mylib/*.py
+		# ruff linting is 10-100X faster than pylint
+		ruff check *.py mylib/*.py
 
 container-lint:
-	docker run --rm -i hadolint/hadolint < Dockerfile
+		docker run --rm -i hadolint/hadolint < Dockerfile
 
 refactor: format lint
 
 deploy:
-	#deploy goes here
+		# Deploy commands go here
 
 all: install lint test format deploy
+
+generate_and_push:
+		# Commit and push generated files only when the tree is dirty; the git steps are chained so the first failure aborts and fails the target
+		@if [ -n "$$(git status --porcelain)" ]; then \
+				git config --local user.email "action@github.com" && \
+				git config --local user.name "GitHub Action" && \
+				git add . && \
+				git commit -m "Add output log" && \
+				git push; \
+		else \
+				echo "No changes to commit. Skipping commit and push."; \
+		fi
diff --git a/README.md b/README.md
@@ -1,37 +1,36 @@
-[![CI](https://github.com/nogibjj/python-ruff-template/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/python-ruff-template/actions/workflows/cicd.yml)
-## Template for Python projects with RUFF linter
-
-![1 15_rust_built_python_tools](https://github.com/nogibjj/python-ruff-template/assets/58792/db5f7bda-a977-4c67-acbe-a70fe034fbdf)
-
-
-
-1. First thing to do on launch is to open a new shell and verify virtualenv is sourced.
-
-Things included are:
-
-* `Makefile`
-
-* `Pytest`
-
-* `pandas`
-
-* `Ruff`:  
-
-Run `make lint` which runs `ruff check`.  You can find out more info on [Ruff here](https://github.com/astral-sh/ruff).
-
-* `Dockerfile`
-
-* `GitHub copilot`
-
-* `jupyter` and `ipython` 
-
-* A base set of libraries for devops and web
-
-* `githubactions`
-
+[![CI](https://github.com/jayliu1016/ids_de_mini10/actions/workflows/cicd.yml/badge.svg)](https://github.com/jayliu1016/ids_de_mini10/actions/workflows/cicd.yml)
+
+# PySpark Data Processing Project
+## Project Overview
+This project focuses on utilizing PySpark for efficient data processing on the US_birth.csv dataset. The primary objectives are to incorporate Spark SQL queries, perform data transformations, and generate a summary of descriptive statistics. The process involves extracting, transforming, and querying the dataset, followed by creating a formatted summary report.
+
+## Dataset
+We use the US_birth.csv dataset, which contains data on U.S. births. This dataset will be processed in a PySpark environment to demonstrate Spark's powerful data processing capabilities on large-scale data.
+
+## Getting Started
+### Environment Setup
+1. Open Codespaces or your preferred IDE.
+2. Wait for the environment setup to complete.
+
+## Output
+- **Spark Output Data:** Generated from PySpark transformations.
+- **Summary Markdown File:** A summary file containing key statistics and insights from the dataset.
+## Code Formatting and Linting
+Code is formatted with `black` (`make format`) and linted with `ruff` (`make lint`) to keep it readable and maintainable.
+
+
+## Process Overview
+1. **Data Extraction:** Extracts the dataset using the `extract` function.
+2. **Spark Session:** Initializes a Spark session via `start_spark`.
+3. **Data Loading:** Loads the dataset into a Spark DataFrame with `load_data`.
+4. **Descriptive Statistics:** Generates summary statistics using `describe`.
+5. **Query Execution:** Executes a SQL query on the dataset via `query`.
+6. **Data Transformation:** Performs additional transformations with `example_transform`.
+7. **End Spark Session:** Closes the Spark session with `end_spark`.
 ## References
-
-![1 1-function-essence-of-programming](https://github.com/nogibjj/python-ruff-template/assets/58792/f7f33cd3-cff5-4014-98ea-09b6a29c7557)
-
-
+- Project template: https://github.com/nogibjj/python-ruff-template
+- Original dataset source: https://github.com/fivethirtyeight/data/tree/master/births
+## GitHub Actions
+This project includes GitHub Actions for CI/CD, automatically running tests and code formatting checks on each push.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -16,4 +16,3 @@ COPY Makefile /tmp/
		RUN su $USER -c "/usr/bin/python3 -m venv /home/${USER}/venv" \
		&& su $USER -c "${VENV_PATH}/bin/pip --disable-pip-version-check --no-cache-dir install -r /tmp/requirements.txt" \
		&& rm -rf /tmp/requirements.txt