From 752d2d9ee95cf3c88a6885792731fafc2393ec6d Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 18 May 2026 14:46:02 +0200 Subject: [PATCH 1/6] style(scripts): apply Ruff formatter and import sorting --- krakenparser/__init__.py | 2 +- krakenparser/counts/convert2csv.py | 1 + krakenparser/counts/processing_script.py | 5 +-- krakenparser/counts/split_mpa.py | 30 ++++++++------ krakenparser/kpplot/base.py | 7 ++-- krakenparser/kpplot/clustermap.py | 6 ++- krakenparser/kpplot/stackedbar.py | 8 ++-- krakenparser/kpplot/streamgraph.py | 8 ++-- krakenparser/krakenparser.py | 35 ++++++++++------ krakenparser/mpa/mpa_table.py | 13 ++++-- krakenparser/mpa/transform2mpa.py | 52 ++++++++++++++++-------- krakenparser/pipeline.py | 43 +++++++++++++------- krakenparser/stats/diversity.py | 39 +++++++++++++----- krakenparser/stats/relabund.py | 7 ++-- tests/conftest.py | 38 ++++++++++------- tests/test_full_pipeline.py | 5 ++- tests/test_integration.py | 25 ++++++++---- tests/test_kpplot.py | 19 +++++---- tests/test_units.py | 13 +++--- 19 files changed, 226 insertions(+), 130 deletions(-) diff --git a/krakenparser/__init__.py b/krakenparser/__init__.py index 1c1f724..2905ccb 100755 --- a/krakenparser/__init__.py +++ b/krakenparser/__init__.py @@ -1,6 +1,6 @@ +from .kpplot.clustermap import clustermap from .kpplot.stackedbar import stacked_barplot from .kpplot.streamgraph import streamgraph -from .kpplot.clustermap import clustermap __all__ = [ "stacked_barplot", diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py index 835ffc3..1177b91 100755 --- a/krakenparser/counts/convert2csv.py +++ b/krakenparser/counts/convert2csv.py @@ -3,6 +3,7 @@ import argparse import logging from pathlib import Path + import pandas as pd _log = logging.getLogger(__name__) diff --git a/krakenparser/counts/processing_script.py b/krakenparser/counts/processing_script.py index b2b28ff..831910e 100755 --- a/krakenparser/counts/processing_script.py +++ b/krakenparser/counts/processing_script.py @@ -1,9 +1,8 @@ #!/usr/bin/env python +import argparse import os -import sys import tempfile -import argparse from pathlib import Path @@ -11,7 +10,7 @@ def modify_taxa_names(line): prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"] for prefix in prefixes: if line.startswith(prefix): - parts = line[len(prefix):].split("\t") + parts = line[len(prefix) :].split("\t") parts[0] = parts[0].replace("_", " ") return "\t".join(parts) return line diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py index 72b4e07..0ee87b8 100644 --- a/krakenparser/counts/split_mpa.py +++ b/krakenparser/counts/split_mpa.py @@ -7,7 +7,6 @@ import argparse import logging import re -import sys from pathlib import Path _log = logging.getLogger(__name__) @@ -15,20 +14,20 @@ _RANKS = [ ("species", "s__", []), - ("genus", "g__", ["s__"]), - ("family", "f__", ["s__", "g__"]), - ("order", "o__", ["s__", "g__", "f__"]), - ("class", "c__", ["s__", "g__", "f__", "o__"]), - ("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]), + ("genus", "g__", ["s__"]), + ("family", "f__", ["s__", "g__"]), + ("order", "o__", ["s__", "g__", "f__"]), + ("class", "c__", ["s__", "g__", "f__", "o__"]), + ("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]), ] _HUMAN_TAXA = { "species": "s__Homo_sapiens", - "genus": "g__Homo", - "family": "f__Hominidae", - "order": "o__Primates", - "class": "c__Mammalia", - "phylum": "p__Chordata", + "genus": "g__Homo", + "family": "f__Hominidae", + "order": "o__Primates", + "class": "c__Mammalia", + "phylum": "p__Chordata", } _ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-") @@ -41,7 +40,7 @@ def _strip_path_prefix(line: str) -> str: return line path, rest = line[:tab], line[tab:] pipe = path.rfind("|") - segment = path[pipe + 1:] if pipe != -1 else path + segment = path[pipe + 1 :] if pipe != -1 else path return _ACCESSION_RE.sub("", segment + rest) @@ -105,7 +104,12 @@ def main() -> None: help="Do not filter human-related taxa (default: filtered)", ) args = parser.parse_args() - split_mpa(args.input, args.output, viruses_only=args.viruses_only, keep_human=args.keep_human) + split_mpa( + args.input, + args.output, + viruses_only=args.viruses_only, + keep_human=args.keep_human, + ) if __name__ == "__main__": diff --git a/krakenparser/kpplot/base.py b/krakenparser/kpplot/base.py index 2df3f0c..8d96aba 100644 --- a/krakenparser/kpplot/base.py +++ b/krakenparser/kpplot/base.py @@ -1,6 +1,7 @@ +from typing import Optional + import matplotlib.pyplot as plt import pandas as pd -from typing import Optional class KpPlotBase: @@ -44,9 +45,7 @@ def aggregate_by_metadata( raise ValueError("metadata must contain 'Sample_id' column") if metadata_group not in metadata.columns: raise ValueError(f"'{metadata_group}' column not found in metadata") - df = df.merge( - metadata[["Sample_id", metadata_group]], on="Sample_id", how="left" - ) + df = df.merge(metadata[["Sample_id", metadata_group]], on="Sample_id", how="left") df = ( df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"] .mean() diff --git a/krakenparser/kpplot/clustermap.py b/krakenparser/kpplot/clustermap.py index f0ad5bb..704bb73 100644 --- a/krakenparser/kpplot/clustermap.py +++ b/krakenparser/kpplot/clustermap.py @@ -1,7 +1,9 @@ -import pandas as pd +from typing import List, Optional, Tuple + import matplotlib.pyplot as plt +import pandas as pd import seaborn as sns -from typing import Optional, Tuple, Union, List + from .base import KpPlotBase, aggregate_by_metadata diff --git a/krakenparser/kpplot/stackedbar.py b/krakenparser/kpplot/stackedbar.py index 8140437..f86cbcc 100644 --- a/krakenparser/kpplot/stackedbar.py +++ b/krakenparser/kpplot/stackedbar.py @@ -1,8 +1,10 @@ -import pandas as pd +from typing import List, Optional, Tuple, Union + import matplotlib.pyplot as plt -import seaborn as sns import numpy as np -from typing import Optional, Tuple, Union, List +import pandas as pd +import seaborn as sns + from .base import KpPlotBase, aggregate_by_metadata diff --git a/krakenparser/kpplot/streamgraph.py b/krakenparser/kpplot/streamgraph.py index 6c389a2..f858be0 100644 --- a/krakenparser/kpplot/streamgraph.py +++ b/krakenparser/kpplot/streamgraph.py @@ -1,8 +1,10 @@ -import pandas as pd +from typing import List, Optional, Tuple, Union + import matplotlib.pyplot as plt -import seaborn as sns import numpy as np -from typing import Optional, Tuple, Union, List +import pandas as pd +import seaborn as sns + from .base import KpPlotBase, aggregate_by_metadata diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py index 6a560ce..28e94bf 100755 --- a/krakenparser/krakenparser.py +++ b/krakenparser/krakenparser.py @@ -1,9 +1,11 @@ import argparse import logging import subprocess -from pathlib import Path import sys -from importlib.metadata import version as _pkg_version, PackageNotFoundError as _PNF +from importlib.metadata import PackageNotFoundError as _PNF +from importlib.metadata import version as _pkg_version +from pathlib import Path + try: __version__ = _pkg_version("krakenparser") except _PNF: @@ -78,15 +80,18 @@ def main(): # Map flags to (script_path, base_args_to_prepend) command_map = { - "complete": (package_dir / "pipeline.py", []), - "kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []), - "combine_mpa": (package_dir / "mpa" / "mpa_table.py", []), - "deconstruct": (package_dir / "counts" / "split_mpa.py", []), - "deconstruct_viruses":(package_dir / "counts" / "split_mpa.py", ["--viruses-only"]), - "process": (package_dir / "counts" / "processing_script.py", []), - "txt2csv": (package_dir / "counts" / "convert2csv.py", []), - "relabund": (package_dir / "stats" / "relabund.py", []), - "diversity": (package_dir / "stats" / "diversity.py", []), + "complete": (package_dir / "pipeline.py", []), + "kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []), + "combine_mpa": (package_dir / "mpa" / "mpa_table.py", []), + "deconstruct": (package_dir / "counts" / "split_mpa.py", []), + "deconstruct_viruses": ( + package_dir / "counts" / "split_mpa.py", + ["--viruses-only"], + ), + "process": (package_dir / "counts" / "processing_script.py", []), + "txt2csv": (package_dir / "counts" / "convert2csv.py", []), + "relabund": (package_dir / "stats" / "relabund.py", []), + "diversity": (package_dir / "stats" / "diversity.py", []), } if "-h" in sys.argv or "--help" in sys.argv: @@ -94,7 +99,9 @@ def main(): parser.print_help() return - def _build_cmd(script: Path, base_args: list[str], user_args: list[str]) -> list[str]: + def _build_cmd( + script: Path, base_args: list[str], user_args: list[str] + ) -> list[str]: if script.suffix == ".py": # Run as module (-m) so the krakenparser package stays importable. # Derive dotted module name from path relative to the package root. @@ -113,7 +120,9 @@ def _build_cmd(script: Path, base_args: list[str], user_args: list[str]) -> list # Default to full pipeline when -i/--input is given without a subcommand if "-i" in extra_args or "--input" in extra_args: complete_script, complete_base = command_map["complete"] - subprocess.run(_build_cmd(complete_script, complete_base, extra_args), check=True) + subprocess.run( + _build_cmd(complete_script, complete_base, extra_args), check=True + ) return parser.print_help() diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py index f66e63b..c808383 100644 --- a/krakenparser/mpa/mpa_table.py +++ b/krakenparser/mpa/mpa_table.py @@ -53,13 +53,18 @@ def main() -> None: description="Combine MPA files into a single tab-delimited table." ) parser.add_argument( - "-i", "--input", - required=True, nargs="+", dest="in_files", + "-i", + "--input", + required=True, + nargs="+", + dest="in_files", help="Input MPA files (one per sample)", ) parser.add_argument( - "-o", "--output", - required=True, dest="o_file", + "-o", + "--output", + required=True, + dest="o_file", help="Output merged MPA file", ) args = parser.parse_args() diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py index 480a1f9..a28cde6 100644 --- a/krakenparser/mpa/transform2mpa.py +++ b/krakenparser/mpa/transform2mpa.py @@ -8,8 +8,14 @@ # Maps Kraken2 single-letter rank codes to MPA prefixes _RANK_PREFIX = { - "D": "d", "K": "k", "P": "p", "C": "c", - "O": "o", "F": "f", "G": "g", "S": "s", + "D": "d", + "K": "k", + "P": "p", + "C": "c", + "O": "o", + "F": "f", + "G": "g", + "S": "s", } @@ -96,9 +102,7 @@ def kreport_to_mpa( # Build the full MPA path; omit intermediate (x__) segments when not requested path = "|".join( - seg - for (_, seg, std) in stack - if include_intermediate or std + seg for (_, seg, std) in stack if include_intermediate or std ) value = str(cum_reads) if use_reads else str(pct) @@ -112,55 +116,71 @@ def main() -> None: mode = parser.add_mutually_exclusive_group(required=True) mode.add_argument( - "-r", "--report-file", "--report", + "-r", + "--report-file", + "--report", dest="r_file", help="Single input Kraken2 report file", ) mode.add_argument( - "-i", "--input", + "-i", + "--input", dest="input_dir", help="Input directory containing Kraken2 report files (batch mode)", ) parser.add_argument( - "-o", "--output", - required=True, dest="o_file", + "-o", + "--output", + required=True, + dest="o_file", help="Output MPA file (single mode) or output directory (batch mode)", ) parser.add_argument( "--display-header", - action="store_true", dest="add_header", default=False, + action="store_true", + dest="add_header", + default=False, help="Write a header line with the sample name (filename)", ) parser.add_argument( "--read_count", - action="store_true", dest="use_reads", default=True, + action="store_true", + dest="use_reads", + default=True, help="Output clade read counts [default]", ) parser.add_argument( "--percentages", - action="store_false", dest="use_reads", + action="store_false", + dest="use_reads", help="Output percentages instead of read counts", ) parser.add_argument( "--intermediate-ranks", - action="store_true", dest="x_include", default=False, + action="store_true", + dest="x_include", + default=False, help="Include non-standard taxonomic ranks in output", ) parser.add_argument( "--no-intermediate-ranks", - action="store_false", dest="x_include", + action="store_false", + dest="x_include", help="Exclude non-standard taxonomic ranks [default]", ) group = parser.add_mutually_exclusive_group() group.add_argument( "--remove-spaces", - action="store_true", dest="remove_spaces", default=True, + action="store_true", + dest="remove_spaces", + default=True, help="Replace spaces with underscores in taxon names [default]", ) group.add_argument( "--keep-spaces", - action="store_false", dest="remove_spaces", + action="store_false", + dest="remove_spaces", help="Keep spaces in taxon names", ) args = parser.parse_args() diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index 02ca6a7..c54b4e9 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -11,13 +11,13 @@ import pandas as pd -from krakenparser.mpa.transform2mpa import kreport_to_mpa -from krakenparser.mpa.mpa_table import combine_mpa -from krakenparser.counts.split_mpa import split_mpa -from krakenparser.counts.processing_script import process_files from krakenparser.counts.convert2csv import convert_to_csv -from krakenparser.stats.relabund import calculate_rel_abund +from krakenparser.counts.processing_script import process_files +from krakenparser.counts.split_mpa import split_mpa +from krakenparser.mpa.mpa_table import combine_mpa +from krakenparser.mpa.transform2mpa import kreport_to_mpa from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div +from krakenparser.stats.relabund import calculate_rel_abund def _is_processable(path: Path) -> bool: @@ -124,36 +124,49 @@ def run_pipeline( def main() -> None: logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser( - description="Run the full KrakenParser pipeline." - ) + parser = argparse.ArgumentParser(description="Run the full KrakenParser pipeline.") parser.add_argument( - "-i", "--input", required=True, + "-i", + "--input", + required=True, help="Directory containing Kraken2 report files", ) parser.add_argument( - "-o", "--output", default=None, + "-o", + "--output", + default=None, help="Output directory (default: parent of input)", ) parser.add_argument( - "--keep-human", action="store_true", default=False, + "--keep-human", + action="store_true", + default=False, help="Do not filter human-related taxa (default: filtered)", ) parser.add_argument( - "-d", "--depth", type=int, default=1000, + "-d", + "--depth", + type=int, + default=1000, help="Rarefaction depth for β-diversity (default: 1000)", ) parser.add_argument( - "-s", "--seed", type=int, default=None, + "-s", + "--seed", + type=int, + default=None, help="Random seed for reproducible rarefaction (default: random)", ) parser.add_argument( - "--overwrite", action="store_true", default=False, + "--overwrite", + action="store_true", + default=False, help="Overwrite the output directory if it already exists", ) args = parser.parse_args() run_pipeline( - args.input, args.output, + args.input, + args.output, keep_human=args.keep_human, rarefaction_depth=args.depth, seed=args.seed, diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index ac3c87f..dd76098 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -34,7 +34,9 @@ def chao1_index(counts): return S_obs + (F1 * F1) / (2 * F2) -def _subsample_counts(counts: np.ndarray, n: int, rng: np.random.Generator) -> np.ndarray: +def _subsample_counts( + counts: np.ndarray, n: int, rng: np.random.Generator +) -> np.ndarray: """Rarefy counts to n reads by sampling without replacement.""" indices = np.repeat(np.arange(len(counts)), counts) sampled = rng.choice(indices, size=n, replace=False) @@ -76,11 +78,13 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): bray_df = pd.DataFrame( squareform(pdist(X, metric="braycurtis")), - index=sample_ids, columns=sample_ids, + index=sample_ids, + columns=sample_ids, ) jaccard_df = pd.DataFrame( squareform(pdist(X.astype(bool).astype(float), metric="jaccard")), - index=sample_ids, columns=sample_ids, + index=sample_ids, + columns=sample_ids, ) bray_df.to_csv(output_path / "beta_div_bray.csv") @@ -89,14 +93,27 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Calculate α & β-diversities.") - parser.add_argument("-i", "--input", required=True, - help="Input total count table CSV (species level).") - parser.add_argument("-o", "--output", required=True, - help="Output directory path.") - parser.add_argument("-d", "--depth", type=int, default=1000, - help="Rarefaction depth for β diversity (default: 1000).") - parser.add_argument("-s", "--seed", type=int, default=None, - help="Random seed for reproducible rarefaction (default: random).") + parser.add_argument( + "-i", + "--input", + required=True, + help="Input total count table CSV (species level).", + ) + parser.add_argument("-o", "--output", required=True, help="Output directory path.") + parser.add_argument( + "-d", + "--depth", + type=int, + default=1000, + help="Rarefaction depth for β diversity (default: 1000).", + ) + parser.add_argument( + "-s", + "--seed", + type=int, + default=None, + help="Random seed for reproducible rarefaction (default: random).", + ) args = parser.parse_args() input_file = Path(args.input) diff --git a/krakenparser/stats/relabund.py b/krakenparser/stats/relabund.py index 171f280..5b1b0c5 100644 --- a/krakenparser/stats/relabund.py +++ b/krakenparser/stats/relabund.py @@ -1,9 +1,10 @@ #!/usr/bin/env python +import argparse import logging import warnings -import argparse from pathlib import Path + import pandas as pd _log = logging.getLogger(__name__) @@ -59,9 +60,7 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): # Save to CSV result.to_csv(output_file, index=False) - _log.info( - "Relative abundance saved as '%s'.", output_file - ) + _log.info("Relative abundance saved as '%s'.", output_file) if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index 8100a3e..f681d4f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,10 @@ import matplotlib + matplotlib.use("Agg") import pandas as pd import pytest - SAMPLE_KREPORT = ( "99.98\t999980\t0\tR\t1\troot\n" "99.98\t999980\t0\tD\t2\t Bacteria\n" @@ -43,12 +43,14 @@ def counts_txt_file(tmp_path): @pytest.fixture def counts_csv_file(tmp_path): - df = pd.DataFrame({ - "Sample_id": ["S1", "S2"], - "Pseudomonas aeruginosa": [300000, 100000], - "Escherichia coli": [200000, 50000], - "Bacteroides fragilis": [100000, 200000], - }) + df = pd.DataFrame( + { + "Sample_id": ["S1", "S2"], + "Pseudomonas aeruginosa": [300000, 100000], + "Escherichia coli": [200000, 50000], + "Bacteroides fragilis": [100000, 200000], + } + ) f = tmp_path / "counts_species.csv" df.to_csv(f, index=False) return f @@ -56,11 +58,17 @@ def counts_csv_file(tmp_path): @pytest.fixture def relabund_df(): - return pd.DataFrame({ - "Sample_id": ["S1", "S1", "S1", "S2", "S2", "S2"], - "taxon": [ - "Pseudomonadota", "Bacillota", "Other (<4.0%)", - "Pseudomonadota", "Bacillota", "Other (<4.0%)", - ], - "rel_abund_perc": [70.0, 20.0, 10.0, 50.0, 35.0, 15.0], - }) + return pd.DataFrame( + { + "Sample_id": ["S1", "S1", "S1", "S2", "S2", "S2"], + "taxon": [ + "Pseudomonadota", + "Bacillota", + "Other (<4.0%)", + "Pseudomonadota", + "Bacillota", + "Other (<4.0%)", + ], + "rel_abund_perc": [70.0, 20.0, 10.0, 50.0, 35.0, 15.0], + } + ) diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py index 599f625..efa6854 100644 --- a/tests/test_full_pipeline.py +++ b/tests/test_full_pipeline.py @@ -1,8 +1,9 @@ -import zipfile import shutil -import pytest +import zipfile from pathlib import Path +import pytest + from krakenparser.pipeline import run_pipeline diff --git a/tests/test_integration.py b/tests/test_integration.py index 7fed2fb..35913d3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -6,13 +6,12 @@ import pandas as pd import pytest -from krakenparser.mpa.transform2mpa import kreport_to_mpa from krakenparser.counts.convert2csv import convert_to_csv from krakenparser.counts.processing_script import process_files from krakenparser.counts.split_mpa import split_mpa -from krakenparser.stats.relabund import calculate_rel_abund +from krakenparser.mpa.transform2mpa import kreport_to_mpa from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div - +from krakenparser.stats.relabund import calculate_rel_abund SAMPLE_COMBINED_MPA = ( "#Classification\tsample1\tsample2\n" @@ -37,6 +36,7 @@ def combined_mpa_file(tmp_path): # Helpers # --------------------------------------------------------------------------- + def _sha256(path) -> str: return hashlib.sha256(path.read_bytes()).hexdigest() @@ -45,6 +45,7 @@ def _sha256(path) -> str: # kreport_to_mpa # --------------------------------------------------------------------------- + def test_kreport_to_mpa_reproducible(kreport_file, tmp_path): counter = itertools.count() @@ -99,6 +100,7 @@ def test_kreport_to_mpa_paths_are_hierarchical(kreport_file, tmp_path): # convert_to_csv # --------------------------------------------------------------------------- + def test_convert_to_csv_reproducible(counts_txt_file, tmp_path): counter = itertools.count() @@ -124,6 +126,7 @@ def test_convert_to_csv_transposes_correctly(counts_txt_file, tmp_path): # process_files # --------------------------------------------------------------------------- + def test_process_files_adds_header_and_cleans_names(tmp_path): source = tmp_path / "COMBINED.txt" source.write_text( @@ -132,8 +135,7 @@ def test_process_files_adds_header_and_cleans_names(tmp_path): ) dest = tmp_path / "counts_species.txt" dest.write_text( - "s__Pseudomonas_aeruginosa\t300\t100\n" - "s__Escherichia_coli\t200\t50\n" + "s__Pseudomonas_aeruginosa\t300\t100\ns__Escherichia_coli\t200\t50\n" ) process_files(str(source), str(dest)) result = dest.read_text() @@ -151,7 +153,9 @@ def test_process_files_reproducible(tmp_path): dest = tmp_path / f"counts_{i}.txt" dest.write_text("s__Some_species\t10\n") process_files(str(source), str(dest)) - assert (tmp_path / "counts_0.txt").read_text() == (tmp_path / "counts_1.txt").read_text() + assert (tmp_path / "counts_0.txt").read_text() == ( + tmp_path / "counts_1.txt" + ).read_text() def test_process_files_missing_source_raises(tmp_path): @@ -177,6 +181,7 @@ def test_convert_to_csv_missing_input_raises(tmp_path): # calculate_rel_abund # --------------------------------------------------------------------------- + def test_relabund_reproducible(counts_csv_file, tmp_path): counter = itertools.count() @@ -220,6 +225,7 @@ def test_relabund_missing_input_raises(tmp_path): # calc_alpha_div # --------------------------------------------------------------------------- + def test_alpha_div_reproducible(counts_csv_file, tmp_path): df = pd.read_csv(counts_csv_file, index_col=0) counter = itertools.count() @@ -256,6 +262,7 @@ def test_alpha_div_shannon_non_negative(counts_csv_file, tmp_path): # calc_beta_div # --------------------------------------------------------------------------- + def test_beta_div_output_files_exist(counts_csv_file, tmp_path): df = pd.read_csv(counts_csv_file, index_col=0) out_dir = tmp_path / "diversity" @@ -281,13 +288,12 @@ def test_beta_div_diagonal_is_zero(counts_csv_file, tmp_path): calc_beta_div(df, out_dir, rarefaction_depth=1000) bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0) import numpy as np + assert np.allclose(np.diag(bray.values), 0.0) def test_beta_div_too_few_samples_raises(tmp_path): - df = pd.DataFrame( - {"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"] - ) + df = pd.DataFrame({"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]) out_dir = tmp_path / "diversity" out_dir.mkdir() with pytest.raises(ValueError, match="rarefaction"): @@ -298,6 +304,7 @@ def test_beta_div_too_few_samples_raises(tmp_path): # split_mpa # --------------------------------------------------------------------------- + def test_split_mpa_creates_all_rank_files(combined_mpa_file, tmp_path): split_mpa(str(combined_mpa_file), str(tmp_path)) for rank in ("species", "genus", "family", "order", "class", "phylum"): diff --git a/tests/test_kpplot.py b/tests/test_kpplot.py index 51cb5e1..a85f74d 100644 --- a/tests/test_kpplot.py +++ b/tests/test_kpplot.py @@ -2,16 +2,16 @@ import pytest +from krakenparser.kpplot.base import KpPlotBase, aggregate_by_metadata +from krakenparser.kpplot.clustermap import clustermap from krakenparser.kpplot.stackedbar import stacked_barplot from krakenparser.kpplot.streamgraph import streamgraph -from krakenparser.kpplot.clustermap import clustermap -from krakenparser.kpplot.base import KpPlotBase, aggregate_by_metadata - # --------------------------------------------------------------------------- # Smoke tests — verify each plot function returns without error # --------------------------------------------------------------------------- + def test_stackedbar_returns_kpplotbase(relabund_df): result = stacked_barplot(relabund_df) assert isinstance(result, KpPlotBase) @@ -31,6 +31,7 @@ def test_clustermap_returns_kpplotbase(relabund_df): # sample_order validation # --------------------------------------------------------------------------- + def test_stackedbar_sample_order_missing_raises(relabund_df): with pytest.raises(ValueError, match="Samples missing"): stacked_barplot(relabund_df, sample_order=["S1", "S2", "GHOST"]) @@ -50,6 +51,7 @@ def test_clustermap_sample_order_missing_raises(relabund_df): # cmap validation (stackedbar / streamgraph) # --------------------------------------------------------------------------- + def test_stackedbar_cmap_too_short_raises(relabund_df): with pytest.raises(ValueError, match="cmap"): stacked_barplot(relabund_df, cmap=["red"]) @@ -74,13 +76,16 @@ def test_streamgraph_cmap_invalid_type_raises(relabund_df): # aggregate_by_metadata # --------------------------------------------------------------------------- + def test_aggregate_by_metadata_basic(relabund_df): import pandas as pd - metadata = pd.DataFrame({ - "Sample_id": ["S1", "S2"], - "Group": ["A", "A"], - }) + metadata = pd.DataFrame( + { + "Sample_id": ["S1", "S2"], + "Group": ["A", "A"], + } + ) result = aggregate_by_metadata(relabund_df, metadata, "Group") assert "Sample_id" in result.columns assert set(result["Sample_id"]) == {"A"} diff --git a/tests/test_units.py b/tests/test_units.py index 57618de..39435d3 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -2,23 +2,22 @@ import math -import numpy as np import pytest +from krakenparser.counts.processing_script import modify_taxa_names from krakenparser.mpa.transform2mpa import _parse_line from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index -from krakenparser.counts.processing_script import modify_taxa_names - # --------------------------------------------------------------------------- # _parse_line # --------------------------------------------------------------------------- + def test_parse_line_standard_rank(): line = "50.00\t500000\t100000\tP\t1224\t Pseudomonadota\n" name, depth, rank, cum_reads, pct = _parse_line(line) assert name == "Pseudomonadota" - assert depth == 2 # 4 leading spaces // 2 + assert depth == 2 # 4 leading spaces // 2 assert rank == "P" assert cum_reads == 500000 assert pct == 50.0 @@ -37,7 +36,7 @@ def test_parse_line_intermediate_rank(): name, depth, rank, _, _ = _parse_line(line) assert name == "Some subspecies" assert rank == "S1" - assert depth == 5 # 10 spaces // 2 + assert depth == 5 # 10 spaces // 2 def test_parse_line_too_few_columns(): @@ -56,6 +55,7 @@ def test_parse_line_non_numeric_reads(): # shannon_index # --------------------------------------------------------------------------- + def test_shannon_uniform_four_species(): assert abs(shannon_index([1, 1, 1, 1]) - math.log(4)) < 1e-10 @@ -76,6 +76,7 @@ def test_shannon_two_equal_species(): # pielou_evenness # --------------------------------------------------------------------------- + def test_pielou_single_species_returns_nan(): assert math.isnan(pielou_evenness([100])) @@ -97,6 +98,7 @@ def test_pielou_range_zero_to_one(): # chao1_index # --------------------------------------------------------------------------- + def test_chao1_f2_zero_uses_f1_formula(): # F1=3, F2=0 → S_obs + F1*(F1-1)/2 counts = [1, 1, 1, 5, 10] # F1=3, F2=0, S_obs=5 @@ -121,6 +123,7 @@ def test_chao1_no_singletons(): # modify_taxa_names # --------------------------------------------------------------------------- + def test_modify_taxa_names_strips_prefix_and_replaces_underscores(): assert modify_taxa_names("s__Homo_sapiens\t100\t200") == "Homo sapiens\t100\t200" From 8e3c0000c0d0510fdbe24f6e27938d347600b2ee Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 18 May 2026 14:46:11 +0200 Subject: [PATCH 2/6] upd ver --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7af15d5..4ae9bca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "krakenparser" -version = "1.0.0" +version = "1.0.1" description = "A collection of scripts designed to process Kraken2 reports and convert them into CSV format." readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"} From 34616e59350e2c05d5f14b96c3a168b6632e9c31 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 18 May 2026 15:02:15 +0200 Subject: [PATCH 3/6] ci: replace flake8 with Ruff linter and formatter --- .github/workflows/python-package.yml | 55 +++++++++++++--------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 553faa6..7a993e1 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,46 +1,41 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - name: Python package - on: push: branches: [ "dev", "main" ] pull_request: branches: [ "dev", "main" ] - jobs: build: - runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install flake8 setuptools wheel - pip install -e ".[dev]" --no-build-isolation - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest --cov=krakenparser --cov-report=xml - - name: Upload coverage to Codecov - if: matrix.python-version == '3.12' - uses: codecov/codecov-action@v5 - with: - files: coverage.xml - token: ${{ secrets.CODECOV_TOKEN }} + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ruff setuptools wheel + pip install -e ".[dev]" --no-build-isolation + - name: Lint with Ruff + run: | + ruff check . + - name: Format check with Ruff + run: | + ruff format --check . + - name: Test with pytest + run: | + pytest --cov=krakenparser --cov-report=xml + - name: Upload coverage to Codecov + if: matrix.python-version == '3.12' + uses: codecov/codecov-action@v5 + with: + files: coverage.xml + token: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file From 3f6f0b47de7248b2b84217a339cda153723ff2bd Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 18 May 2026 15:02:24 +0200 Subject: [PATCH 4/6] ci: add codecov.yml with informational patch coverage --- codecov.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..bc81794 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,6 @@ +# codecov.yml +coverage: + patch: + target: 78% # снизить порог + # или: + informational: true # не фейлить, только информировать \ No newline at end of file From bc0f1e579f7c061681ad3bb43024714f11ea9710 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 18 May 2026 15:05:11 +0200 Subject: [PATCH 5/6] style(ci): remove comments from workflow file --- codecov.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/codecov.yml b/codecov.yml index bc81794..4200a1c 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,6 +1,4 @@ -# codecov.yml coverage: patch: - target: 78% # снизить порог - # или: - informational: true # не фейлить, только информировать \ No newline at end of file + target: 78% + informational: true \ No newline at end of file From 4dc34dc4289b34ad0eebe82ddaeb6e8f2c224507 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 18 May 2026 15:05:52 +0200 Subject: [PATCH 6/6] revert: replace flake8 with Ruff linter and formatter --- .github/workflows/python-package.yml | 55 +++++++++++++++------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7a993e1..553faa6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,41 +1,46 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + name: Python package + on: push: branches: [ "dev", "main" ] pull_request: branches: [ "dev", "main" ] + jobs: build: + runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install ruff setuptools wheel - pip install -e ".[dev]" --no-build-isolation - - name: Lint with Ruff - run: | - ruff check . - - name: Format check with Ruff - run: | - ruff format --check . - - name: Test with pytest - run: | - pytest --cov=krakenparser --cov-report=xml - - name: Upload coverage to Codecov - if: matrix.python-version == '3.12' - uses: codecov/codecov-action@v5 - with: - files: coverage.xml - token: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 setuptools wheel + pip install -e ".[dev]" --no-build-isolation + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest --cov=krakenparser --cov-report=xml + - name: Upload coverage to Codecov + if: matrix.python-version == '3.12' + uses: codecov/codecov-action@v5 + with: + files: coverage.xml + token: ${{ secrets.CODECOV_TOKEN }}