Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Codecov configuration.
# NOTE(review): the commit-status settings are nested under `coverage.status`
# here, following Codecov's documented schema — confirm with the Codecov
# YAML validator before merging.
coverage:
  status:
    patch:
      default:
        # Coverage target for lines touched by a pull request.
        target: 78%
        # `informational: true` is documented by Codecov as reporting the
        # status without failing the check.
        informational: true
2 changes: 1 addition & 1 deletion krakenparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .kpplot.clustermap import clustermap
from .kpplot.stackedbar import stacked_barplot
from .kpplot.streamgraph import streamgraph
from .kpplot.clustermap import clustermap

__all__ = [
"stacked_barplot",
Expand Down
1 change: 1 addition & 0 deletions krakenparser/counts/convert2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import logging
from pathlib import Path

import pandas as pd

_log = logging.getLogger(__name__)
Expand Down
5 changes: 2 additions & 3 deletions krakenparser/counts/processing_script.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
#!/usr/bin/env python

import argparse
import os
import sys
import tempfile
import argparse
from pathlib import Path


def modify_taxa_names(line):
    """Strip a leading rank prefix and replace underscores in the taxon name.

    If ``line`` starts with one of the prefixes ``s__``, ``g__``, ``f__``,
    ``o__``, ``c__`` or ``p__``, the prefix is removed and underscores in the
    first tab-separated field are replaced with spaces. All following
    tab-separated fields are left untouched.

    Args:
        line: A single text line, tab-delimited.

    Returns:
        The rewritten line, or ``line`` unchanged when it does not start
        with any of the recognised prefixes.
    """
    prefixes = ("s__", "g__", "f__", "o__", "c__", "p__")
    for prefix in prefixes:
        if line.startswith(prefix):
            parts = line[len(prefix) :].split("\t")
            # Only the name column is humanised; later columns may
            # legitimately contain underscores and must be preserved.
            parts[0] = parts[0].replace("_", " ")
            return "\t".join(parts)
    return line
Expand Down
30 changes: 17 additions & 13 deletions krakenparser/counts/split_mpa.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,27 @@
import argparse
import logging
import re
import sys
from pathlib import Path

_log = logging.getLogger(__name__)


_RANKS = [
("species", "s__", []),
("genus", "g__", ["s__"]),
("family", "f__", ["s__", "g__"]),
("order", "o__", ["s__", "g__", "f__"]),
("class", "c__", ["s__", "g__", "f__", "o__"]),
("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]),
("genus", "g__", ["s__"]),
("family", "f__", ["s__", "g__"]),
("order", "o__", ["s__", "g__", "f__"]),
("class", "c__", ["s__", "g__", "f__", "o__"]),
("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]),
]

_HUMAN_TAXA = {
"species": "s__Homo_sapiens",
"genus": "g__Homo",
"family": "f__Hominidae",
"order": "o__Primates",
"class": "c__Mammalia",
"phylum": "p__Chordata",
"genus": "g__Homo",
"family": "f__Hominidae",
"order": "o__Primates",
"class": "c__Mammalia",
"phylum": "p__Chordata",
}

_ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-")
Expand All @@ -41,7 +40,7 @@ def _strip_path_prefix(line: str) -> str:
return line
path, rest = line[:tab], line[tab:]
pipe = path.rfind("|")
segment = path[pipe + 1:] if pipe != -1 else path
segment = path[pipe + 1 :] if pipe != -1 else path
return _ACCESSION_RE.sub("", segment + rest)


Expand Down Expand Up @@ -105,7 +104,12 @@ def main() -> None:
help="Do not filter human-related taxa (default: filtered)",
)
args = parser.parse_args()
split_mpa(args.input, args.output, viruses_only=args.viruses_only, keep_human=args.keep_human)
split_mpa(
args.input,
args.output,
viruses_only=args.viruses_only,
keep_human=args.keep_human,
)


if __name__ == "__main__":
Expand Down
7 changes: 3 additions & 4 deletions krakenparser/kpplot/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional

import matplotlib.pyplot as plt
import pandas as pd
from typing import Optional


class KpPlotBase:
Expand Down Expand Up @@ -44,9 +45,7 @@ def aggregate_by_metadata(
raise ValueError("metadata must contain 'Sample_id' column")
if metadata_group not in metadata.columns:
raise ValueError(f"'{metadata_group}' column not found in metadata")
df = df.merge(
metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
)
df = df.merge(metadata[["Sample_id", metadata_group]], on="Sample_id", how="left")
df = (
df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
.mean()
Expand Down
6 changes: 4 additions & 2 deletions krakenparser/kpplot/clustermap.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pandas as pd
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from typing import Optional, Tuple, Union, List

from .base import KpPlotBase, aggregate_by_metadata


Expand Down
8 changes: 5 additions & 3 deletions krakenparser/kpplot/stackedbar.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import pandas as pd
from typing import List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Optional, Tuple, Union, List
import pandas as pd
import seaborn as sns

from .base import KpPlotBase, aggregate_by_metadata


Expand Down
8 changes: 5 additions & 3 deletions krakenparser/kpplot/streamgraph.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import pandas as pd
from typing import List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Optional, Tuple, Union, List
import pandas as pd
import seaborn as sns

from .base import KpPlotBase, aggregate_by_metadata


Expand Down
35 changes: 22 additions & 13 deletions krakenparser/krakenparser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import argparse
import logging
import subprocess
from pathlib import Path
import sys
from importlib.metadata import version as _pkg_version, PackageNotFoundError as _PNF
from importlib.metadata import PackageNotFoundError as _PNF
from importlib.metadata import version as _pkg_version
from pathlib import Path

try:
__version__ = _pkg_version("krakenparser")
except _PNF:
Expand Down Expand Up @@ -78,23 +80,28 @@ def main():

# Map flags to (script_path, base_args_to_prepend)
command_map = {
"complete": (package_dir / "pipeline.py", []),
"kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []),
"combine_mpa": (package_dir / "mpa" / "mpa_table.py", []),
"deconstruct": (package_dir / "counts" / "split_mpa.py", []),
"deconstruct_viruses":(package_dir / "counts" / "split_mpa.py", ["--viruses-only"]),
"process": (package_dir / "counts" / "processing_script.py", []),
"txt2csv": (package_dir / "counts" / "convert2csv.py", []),
"relabund": (package_dir / "stats" / "relabund.py", []),
"diversity": (package_dir / "stats" / "diversity.py", []),
"complete": (package_dir / "pipeline.py", []),
"kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []),
"combine_mpa": (package_dir / "mpa" / "mpa_table.py", []),
"deconstruct": (package_dir / "counts" / "split_mpa.py", []),
"deconstruct_viruses": (
package_dir / "counts" / "split_mpa.py",
["--viruses-only"],
),
"process": (package_dir / "counts" / "processing_script.py", []),
"txt2csv": (package_dir / "counts" / "convert2csv.py", []),
"relabund": (package_dir / "stats" / "relabund.py", []),
"diversity": (package_dir / "stats" / "diversity.py", []),
}

if "-h" in sys.argv or "--help" in sys.argv:
if not any(getattr(args, key) for key in command_map):
parser.print_help()
return

def _build_cmd(script: Path, base_args: list[str], user_args: list[str]) -> list[str]:
def _build_cmd(
script: Path, base_args: list[str], user_args: list[str]
) -> list[str]:
if script.suffix == ".py":
# Run as module (-m) so the krakenparser package stays importable.
# Derive dotted module name from path relative to the package root.
Expand All @@ -113,7 +120,9 @@ def _build_cmd(script: Path, base_args: list[str], user_args: list[str]) -> list
# Default to full pipeline when -i/--input is given without a subcommand
if "-i" in extra_args or "--input" in extra_args:
complete_script, complete_base = command_map["complete"]
subprocess.run(_build_cmd(complete_script, complete_base, extra_args), check=True)
subprocess.run(
_build_cmd(complete_script, complete_base, extra_args), check=True
)
return

parser.print_help()
Expand Down
13 changes: 9 additions & 4 deletions krakenparser/mpa/mpa_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,18 @@ def main() -> None:
description="Combine MPA files into a single tab-delimited table."
)
parser.add_argument(
"-i", "--input",
required=True, nargs="+", dest="in_files",
"-i",
"--input",
required=True,
nargs="+",
dest="in_files",
help="Input MPA files (one per sample)",
)
parser.add_argument(
"-o", "--output",
required=True, dest="o_file",
"-o",
"--output",
required=True,
dest="o_file",
help="Output merged MPA file",
)
args = parser.parse_args()
Expand Down
52 changes: 36 additions & 16 deletions krakenparser/mpa/transform2mpa.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@

# Maps Kraken2 single-letter rank codes to MPA prefixes
_RANK_PREFIX = {
"D": "d", "K": "k", "P": "p", "C": "c",
"O": "o", "F": "f", "G": "g", "S": "s",
"D": "d",
"K": "k",
"P": "p",
"C": "c",
"O": "o",
"F": "f",
"G": "g",
"S": "s",
}


Expand Down Expand Up @@ -96,9 +102,7 @@ def kreport_to_mpa(

# Build the full MPA path; omit intermediate (x__) segments when not requested
path = "|".join(
seg
for (_, seg, std) in stack
if include_intermediate or std
seg for (_, seg, std) in stack if include_intermediate or std
)

value = str(cum_reads) if use_reads else str(pct)
Expand All @@ -112,55 +116,71 @@ def main() -> None:

mode = parser.add_mutually_exclusive_group(required=True)
mode.add_argument(
"-r", "--report-file", "--report",
"-r",
"--report-file",
"--report",
dest="r_file",
help="Single input Kraken2 report file",
)
mode.add_argument(
"-i", "--input",
"-i",
"--input",
dest="input_dir",
help="Input directory containing Kraken2 report files (batch mode)",
)

parser.add_argument(
"-o", "--output",
required=True, dest="o_file",
"-o",
"--output",
required=True,
dest="o_file",
help="Output MPA file (single mode) or output directory (batch mode)",
)
parser.add_argument(
"--display-header",
action="store_true", dest="add_header", default=False,
action="store_true",
dest="add_header",
default=False,
help="Write a header line with the sample name (filename)",
)
parser.add_argument(
"--read_count",
action="store_true", dest="use_reads", default=True,
action="store_true",
dest="use_reads",
default=True,
help="Output clade read counts [default]",
)
parser.add_argument(
"--percentages",
action="store_false", dest="use_reads",
action="store_false",
dest="use_reads",
help="Output percentages instead of read counts",
)
parser.add_argument(
"--intermediate-ranks",
action="store_true", dest="x_include", default=False,
action="store_true",
dest="x_include",
default=False,
help="Include non-standard taxonomic ranks in output",
)
parser.add_argument(
"--no-intermediate-ranks",
action="store_false", dest="x_include",
action="store_false",
dest="x_include",
help="Exclude non-standard taxonomic ranks [default]",
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"--remove-spaces",
action="store_true", dest="remove_spaces", default=True,
action="store_true",
dest="remove_spaces",
default=True,
help="Replace spaces with underscores in taxon names [default]",
)
group.add_argument(
"--keep-spaces",
action="store_false", dest="remove_spaces",
action="store_false",
dest="remove_spaces",
help="Keep spaces in taxon names",
)
args = parser.parse_args()
Expand Down
Loading
Loading