Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[![master](https://github.com/kuijjerlab/sponge/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/kuijjerlab/sponge/actions/workflows/test.yaml)
[![devel](https://github.com/kuijjerlab/sponge/actions/workflows/test.yaml/badge.svg?branch=devel)](https://github.com/kuijjerlab/sponge/actions/workflows/test.yaml)
[![main](https://img.shields.io/github/actions/workflow/status/kuijjerlab/sponge/test.yaml?branch=main&label=main)](https://github.com/kuijjerlab/sponge/actions/workflows/test.yaml)
[![devel](https://img.shields.io/github/actions/workflow/status/kuijjerlab/sponge/test.yaml?branch=devel&label=devel)](https://github.com/kuijjerlab/sponge/actions/workflows/test.yaml)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)


Expand Down Expand Up @@ -247,7 +247,7 @@ In HPC environments, something like the `apptainer shell` command would
work.

Because of the libraries used for bigbed format support, SPONGE is not
currently supported on Windows.
currently supported on Windows.
Therefore, this container is probably the best way to run it there,
and the command equivalent to the above in the command prompt would look
like this:
Expand All @@ -263,7 +263,8 @@ The project is: _in progress_.

## Room for Improvement
Room for improvement:
- Better tests
- More test cases
- Tests with expected failure

To do:
- Support for more species
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@ classifiers = [
"Programming Language :: Python :: 3",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = [
dependencies = [
"bioframe>=0.8.0,<0.9",
"matplotlib>=3.10.8,<4",
"numpy>=1.19.0,<3",
"pandas>=2.0.0,<3",
"pybbi>=0.4.0,<0.5",
"pyjaspar>=4.0.0,<5",
"scikit-learn>=1.7.2,<2",
"setuptools-scm>=8.3.0,<9",
"tqdm>=4.67.1,<5",
]

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ pandas
pybbi
pyjaspar
scikit-learn
setuptools-scm
tqdm
14 changes: 10 additions & 4 deletions src/sponge/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand All @@ -22,12 +22,13 @@
from argparse import (ArgumentDefaultsHelpFormatter, ArgumentParser,
RawDescriptionHelpFormatter)
from pathlib import Path
from setuptools_scm import get_version

from .sponge import Sponge

### Class definition ###
class CustomFormatter(
ArgumentDefaultsHelpFormatter,
ArgumentDefaultsHelpFormatter,
RawDescriptionHelpFormatter,
):
"""
Expand All @@ -47,7 +48,7 @@ def cli(
"""

DESCRIPTION = """
SPONGE Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
SPONGE Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
This program comes with ABSOLUTELY NO WARRANTY.
This is free software, and you are welcome to redistribute it under certain conditions.
Please refer to the GPL-3.0 license for more details.
Expand All @@ -70,10 +71,15 @@ def cli(
help='create an example config file with the default values called '
'user_config.yaml and exit',
action='store_true')
parser.add_argument('-v', '--version', dest='show_version',
help='show the current version and exit', action='store_true')

args = parser.parse_args()

if args.example_config:
if args.show_version:
print (get_version(version_scheme='only-version',
local_scheme='no-local-version'))
elif args.example_config:
file_dir = Path(__file__).parents[0]
shutil.copy(os.path.join(file_dir, 'user_config.yaml'),
'user_config.yaml')
Expand Down
2 changes: 1 addition & 1 deletion src/sponge/config_manager.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down
4 changes: 2 additions & 2 deletions src/sponge/modules/analysis/analysis.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down Expand Up @@ -43,7 +43,7 @@ def load_prior(
The processed pandas DataFrame
"""

return pd.read_csv(path, sep='\t', header=None, names=names)
return pd.read_table(path, header=None, names=names)


def describe_prior(
Expand Down
16 changes: 9 additions & 7 deletions src/sponge/modules/data_retriever/data_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand All @@ -17,7 +17,7 @@

### Imports ###
from pathlib import Path
from typing import List
from typing import Optional

from sponge.config_manager import ConfigManager
from sponge.modules.version_logger import VersionLogger
Expand All @@ -33,7 +33,7 @@ def __init__(
temp_folder: Path,
core_config: ConfigManager,
user_config: ConfigManager,
version_logger: VersionLogger,
version_logger: Optional[VersionLogger] = None,
):
"""
Class that retrieves the data required for the running of
Expand All @@ -47,8 +47,9 @@ def __init__(
Core configuration of SPONGE
user_config : ConfigManager
User-provided configuration of SPONGE
version_logger : VersionLogger
Version logger to keep track of the retrieved files
version_logger : Optional[VersionLogger], optional
Version logger to keep track of the retrieved files or None
to not use one, by default None
"""

# JASPAR bigbed file (if appropriate)
Expand All @@ -72,8 +73,9 @@ def __init__(
core_config['default_chromosomes'],
)
# Register the classes for version logging
version_logger.register_class(self.tfbs)
version_logger.register_class(self.regions)
if version_logger is not None:
version_logger.register_class(self.tfbs)
version_logger.register_class(self.regions)


def retrieve_data(
Expand Down
2 changes: 1 addition & 1 deletion src/sponge/modules/data_retriever/file_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down
33 changes: 19 additions & 14 deletions src/sponge/modules/data_retriever/region_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down Expand Up @@ -80,7 +80,7 @@ def __init__(
self.rest = rest_url
self.mapping_url = mapping_url
self.assembly = genome_assembly
self.mapping = default_mapping
self.mapping = pd.Series(default_mapping)

if region_settings['chromosomes'] is None:
region_settings['chromosomes'] = default_chromosomes
Expand All @@ -105,33 +105,38 @@ def _retrieve_region(
Version of the Ensembl database corresponding to the file
"""

# Attempt to retrieve mapping
mapping = get_chromosome_mapping(self.assembly, self.mapping_url)
if mapping is not None:
# Managed to retrieve it, overwrite the default
self.mapping = mapping
# Inverted mapping to apply chromosome filtering
inv_mapping = pd.Series(self.mapping.index, index=self.mapping.values)
# Filter for the required chromosomes
chroms = [inv_mapping[chrom] for chrom in self.settings['chromosomes']]
filter = {'chromosome_name': chroms}
# Attributes to retrieve
attributes = ['ensembl_transcript_id', 'transcript_gencode_basic',
ATTRIBUTES = ['ensembl_transcript_id', 'transcript_gencode_basic',
'chromosome_name', 'transcription_start_site', 'strand',
'ensembl_gene_id', 'external_gene_name', 'gene_biotype']
# Submit and retrieve the response
buffer = retrieve_ensembl_data('hsapiens_gene_ensembl', attributes,
self.xml)
buffer = retrieve_ensembl_data('hsapiens_gene_ensembl', ATTRIBUTES,
self.xml, filter)

# Dictionary of types for conversion from the response, default strings
dtype_dict = defaultdict(lambda: str)
# Change the types that are not strings but integers
dtype_dict['Transcription start site (TSS)'] = int
dtype_dict['Strand'] = int
# Convert the response into a DataFrame
df = pd.read_csv(buffer, sep='\t', dtype=dtype_dict)
df = pd.read_table(buffer, dtype=dtype_dict)

print ('\nFiltering and modifying dataframe...\n')
print ('\nFiltering and modifying dataframe...')
if self.settings['filter_basic']:
# Filter only for GENCODE basic
df = df[df['GENCODE basic annotation'] == 'GENCODE basic'].copy()
df.drop(columns='GENCODE basic annotation', inplace=True)
# Convert chromosome names to match with other inputs
# Attempt to retrieve mapping
mapping = get_chromosome_mapping(self.assembly, self.mapping_url)
if mapping is not None:
# Managed to retrieve it, overwrite the default
self.mapping = mapping
df['Chromosome'] = df['Chromosome/scaffold name'].apply(lambda x:
self.mapping[x] if x in self.mapping else None)
not_mapped = df['Chromosome'].isna().sum()
Expand Down Expand Up @@ -160,11 +165,11 @@ def _retrieve_region(
df.sort_values(['Chromosome', 'Start'], inplace=True)

# Columns to be saved into a file
columns = ['Chromosome', 'Start', 'End', 'Transcript stable ID',
COLUMNS = ['Chromosome', 'Start', 'End', 'Transcript stable ID',
'Gene stable ID', 'Gene name', 'Gene type']
print (f'\nSaving data to: {self.temp_filename}')
# Save the file
self.df = df[columns]
self.df = df[COLUMNS]
self.df.to_csv(self.temp_filename, sep='\t', index=False)

return get_ensembl_version(self.rest)
Expand Down
4 changes: 3 additions & 1 deletion src/sponge/modules/data_retriever/tfbs_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down Expand Up @@ -65,6 +65,8 @@ def __init__(
self.jaspar_release = motif_settings['jaspar_release']
self.genome_assembly = genome_assembly
self.urls = motif_url
if type(self.urls) is not list:
self.urls = [self.urls]

temp_filename = os.path.join(temp_folder, self._default_filename)

Expand Down
12 changes: 7 additions & 5 deletions src/sponge/modules/file_writer/file_writer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand All @@ -19,7 +19,7 @@
import pandas as pd

from pathlib import Path
from typing import Iterable
from typing import Iterable, Union

### Class definition ###
class FileWriter:
Expand All @@ -30,14 +30,14 @@ def __init__(
"""
Class which writes prior networks to files.
"""

pass


def write_network_file(
self,
df: pd.DataFrame,
node_columns: Iterable[str],
node_columns: Union[Iterable[str], str],
weight_column: str,
file_name: Path,
):
Expand All @@ -55,7 +55,9 @@ def write_network_file(
file_name : Path
Where to write the network to
"""


if type(node_columns) == str:
node_columns = [node_columns]
sorted_df = df.sort_values(by=node_columns)
sorted_df[node_columns + [weight_column]].to_csv(
file_name, sep='\t', index=False, header=False)
4 changes: 2 additions & 2 deletions src/sponge/modules/match_aggregator/match_aggregator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down Expand Up @@ -47,7 +47,7 @@ def __init__(
"""

self.initial_edges = edges
self.regions = pd.read_csv(regions_file, sep='\t')
self.regions = pd.read_table(regions_file)
self.homolog_mapping = homolog_mapping
# To be overwritten once retrieved
self.edges = None
Expand Down
2 changes: 1 addition & 1 deletion src/sponge/modules/match_filter/match_filter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down
15 changes: 9 additions & 6 deletions src/sponge/modules/motif_selector/homology_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down Expand Up @@ -112,6 +112,7 @@ def find_homologs(

# Get the non-species motif names
xeno_motif_names = [motif.name for motif in xeno_motifs]
xeno_motif_ids = [motif.matrix_id for motif in xeno_motifs]
# Compare against NCBI homologs
found_names = [adjust_gene_name(motif.name) for motif in xeno_motifs
if not False in [acc in homologs for acc in motif.acc]]
Expand All @@ -129,7 +130,8 @@ def find_homologs(
for motif in xeno_motifs
if not False in [acc in homologs for acc in motif.acc]
}
corr_df = pd.DataFrame(xeno_motif_names, columns=['Original Name'])
corr_df = pd.DataFrame({'Original Name': xeno_motif_names,
'Matrix ID': xeno_motif_ids})
corr_df['Adjusted Name'] = corr_df['Original Name'].apply(
adjust_gene_name)
corr_df['Species Name'] = corr_df['Original Name'].apply(
Expand All @@ -146,10 +148,11 @@ def find_homologs(
print (f'{i}:', to_print.loc[i])

# Calculate the information content for duplicates
duplicated['IC'] = duplicated['Original Name'].apply(lambda x:
max(tf_to_motif[x].values()))
duplicated['IC'] = duplicated.apply(lambda row:
tf_to_motif[row['Original Name']][row['Matrix ID']], axis=1,
result_type='reduce')
# Keep the highest IC amongst the duplicates
to_drop = duplicated['Original Name'][duplicated.sort_values(
to_drop = duplicated['Matrix ID'][duplicated.sort_values(
'IC').duplicated('Species Name', keep='last')]
else:
to_drop = []
Expand All @@ -164,7 +167,7 @@ def find_homologs(
# homologs
corr_df_final = corr_df[(corr_df['Duplicate'] == False) &
(~corr_df['Species Name'].isna()) &
(corr_df['Original Name'].isin(to_drop) == False)]
(corr_df['Matrix ID'].isin(to_drop) == False)]

# The mapping of out-species to in-species names
# and the matrix IDs to be kept
Expand Down
2 changes: 1 addition & 1 deletion src/sponge/modules/motif_selector/jaspar_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2025 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
# Copyright (C) 2026 Ladislav Hovan <ladislav.hovan@ncmbm.uio.no>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand Down
Loading
Loading