Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 37 additions & 37 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions reference-data/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ ComplexFinder(

### Complex Portal

Go the [Complex Portal Website](https://www.ebi.ac.uk/complexportal/home) and download the database (save it as HUMAN_COMPLEX_PORTAL.txt) for the utilized organismn.
Go to the [Complex Portal Website](https://www.ebi.ac.uk/complexportal/home) and download the database (save it as HUMAN_COMPLEX_PORTAL.txt) for the utilized organism.


```python
Expand All @@ -40,7 +40,7 @@ ComplexFinder(

### hu.MAP 2.0

The hu.MAP 2.0 has recently beend published and is available at this [link](http://humap2.proteincomplexes.org).
The hu.MAP 2.0 has recently been published and is available at this [link](http://humap2.proteincomplexes.org).

```python
ComplexFinder(
Expand Down
232 changes: 116 additions & 116 deletions src/main.py

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions src/modules/Database.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class Database(object):
def __init__(self, nJobs = 4, splitString = ";"):
"""Database Module.

The pipeline requires a database containing positve feature interactions.
The pipeline requires a database containing positive feature interactions.
This module finds interactions present in the dataset to be analysed,
creates decoy interactions and matches metrics to databases.

Expand Down Expand Up @@ -115,7 +115,7 @@ def _filterDb(self,
raise ValueError("complexNameColumn not in database")


def pariwiseProteinInteractions(self,
def pairwiseProteinInteractions(self,
complexIDsColumn,
dbID = "20190823_CORUM.txt",
filterDb = {'Organism': ["Human"]},
Expand Down Expand Up @@ -163,7 +163,7 @@ def addDecoy(self, sizeFraction = 1.2):
Adds a decoy database to the module.

Random entries from positive data are taken and Fake
complexes are build. Self-ineractions (x1 == x2) are
complexes are built. Self-interactions (x1 == x2) are
not allowed and ignored. Duplicated interactions are
also ignored, as are positive interactions that are
reported in a different positive complex.
Expand Down Expand Up @@ -277,7 +277,7 @@ def getInteractionClassByE1E2(self,E1E2s,E1s,E2s):
else:
E1E2Type.append("decoy")
else:
#if we get here, those itneractions cannot be positive or decoy
#if we get here, those interactions cannot be positive or decoy
e1 = E1s[n]
e2 = E2s[n]

Expand Down Expand Up @@ -353,7 +353,7 @@ def _saveFilteredDf(self,fileName):
def collectPairwiseInt(self,i,interactors,complexName,predictClass,splitString = ";"):

collectedResult = []
for interaction in self._getPariwiseInteractions(interactors.split(splitString)):
for interaction in self._getPairwiseInteractions(interactors.split(splitString)):
interaction = [e[:6] for e in interaction]
collectedResult.append({"ComplexID":i,"E1":interaction[0],"E2":interaction[1],"E1E2":''.join(sorted(interaction)),"complexName":complexName,"Class":predictClass})
return collectedResult
Expand All @@ -368,7 +368,7 @@ def _findPositiveInteractions(self,filteredDB, df, dbID, complexNameColumn):
return df


def _getPariwiseInteractions(self,entryList):
def _getPairwiseInteractions(self, entryList):
""
return itertools.combinations(entryList, 2)

Expand Down Expand Up @@ -426,23 +426,23 @@ def findMatch(self,x,metricDf, mCols):
return metricDf.loc[metricDf["E2E1"] == search,mCols]

@property
def indentifiedComplexes(self):
def identifiedComplexes(self):
if hasattr(self,'uniqueComplexesIdentified'):
return self.uniqueComplexesIdentified

def identifiableComplexes(self,complexMemberIds, ID = "20190823_CORUM.txt"):
""
identifiableMebmers = OrderedDict()
identifiableMembers = OrderedDict()
if hasattr(self,'uniqueComplexesIdentified'):
for k in self.uniqueComplexesIdentified.keys():
identifiableMebmers[k] = {}
identifiableMembers[k] = {}
boolIdx = self.dbs[ID].index == k
complexData = self.dbs[ID][boolIdx]
cMembers = complexData[complexMemberIds].tolist()[0].split(";")
identifiableMebmers[k]["n"] = len(cMembers)
identifiableMebmers[k]["members"] = cMembers
identifiableMembers[k]["n"] = len(cMembers)
identifiableMembers[k]["members"] = cMembers

return identifiableMebmers
return identifiableMembers


def assignComplexToProtein(self, e, complexMemberIds, complexIDColumn, ID = "20190823_CORUM.txt", filterDict = {'Organism': ["Human"]}):
Expand Down Expand Up @@ -553,12 +553,12 @@ def matchMetrices(self,pathToTmp,entriesInChunks,metricColumns,analysisName,forc

def _createChunks(self,pathToTmp,entriesInChunks,metricColumns):
"""
Craetes chunks
Creates chunks


To do:

Parellelerize.
Parallelize.

Parameters
----------
Expand Down Expand Up @@ -728,10 +728,10 @@ def matchInteractions(self,columnLabel, distanceMatrix):
def fillComplexMatrixFromData(self, X):
""
if not isinstance(X, pd.DataFrame):
raise ValueError("X must be a pandas data frame with index and columns containg ID")
raise ValueError("X must be a pandas data frame with index and columns containing ID")

return X.merge(self.df,how="left",left_index=True,right_on="E1;E2")


if __name__ == "__main__":
Database().pariwiseProteinInteractions("subunits(UniProt IDs)")
Database().pairwiseProteinInteractions("subunits(UniProt IDs)")
14 changes: 7 additions & 7 deletions src/modules/Distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


def minMaxNorm(X,axis=0):
"Normalize array betweem 0 and 1"
"Normalize array between 0 and 1"
Xmin = np.nanmin(X,axis=axis, keepdims=True)
Xmax = np.nanmax(X,axis=axis,keepdims=True)
X_transformed = (X - Xmin) / (Xmax-Xmin)
Expand Down Expand Up @@ -116,7 +116,7 @@ def _pearson(u,v):

@jit()
def pearson(nY,Ys):
"Calcualtes pearson correlation."
"""Calculates pearson correlation."""
return [_pearson(nY,Y) for Y in Ys]


Expand Down Expand Up @@ -217,16 +217,16 @@ def __init__(self,
Identifier of E1

E2 : obj:`list` of obj:`np.array`
Signal intensity of E2s. Disntances
betwenn ID and E2 are calculated.
The intensitiy profiles of E2s are uploaded from source.npy.
Signal intensity of E2s. Distances
between ID and E2 are calculated.
The intensity profiles of E2s are uploaded from source.npy.

ownPeaks : obj:`list` of obj:`dict`
List of modelled peaks for Y. Required to calculate apex distance,
which is equal to the euclidean dinstance of the closest peaks.
which is equal to the euclidean distance of the closest peaks.

metrices : obj:`list` of obj:`str` or obj:`list` of obj:`dict`
List of strings or dictionories of metrices used to calculate distance.
List of strings or dictionaries of metrices used to calculate distance.
If dict is provided, two keys namely `fn` and `name` must be provided.
The name must be unique (if more than one dict is provided).

Expand Down
14 changes: 7 additions & 7 deletions src/modules/Distance_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,21 @@ def __init__(self,
Identifier of E1

E2 : obj:`list` of obj:`np.array`
Signal intensity of E2s. Disntances
betwenn ID and E2 are calculated.
The intensitiy profiles of E2s are uploaded from source.npy.
Signal intensity of E2s. Distances
between ID and E2 are calculated.
The intensity profiles of E2s are uploaded from source.npy.

ownPeaks : obj:`list` of obj:`dict`
List of modelled peaks for Y. Required to calculate apex distance,
which is equal to the euclidean dinstance of the closest peaks.
which is equal to the euclidean distance of the closest peaks.

metrices : obj:`list` of obj:`str` or obj:`list` of obj:`dict`
List of strings or dictionories of metrices used to calculate distance.
If dict is provided, two keys namely `fn`and `name`must be provided.
List of strings or dictionaries of metrices used to calculate distance.
If dict is provided, two keys namely `fn` and `name` must be provided.
The name must be unique (if more than one dict is provided).

pathToTmp : string
Path to the temporary folder for the current anaylsis. Required to load
Path to the temporary folder for the current analysis. Required to load
Signals (called Ys)

chunkName : string
Expand Down
18 changes: 9 additions & 9 deletions src/modules/Predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def __init__(self, classifierClass = "random forest", n_jobs = 4, gridSearch = N

def _initClassifier(self):
"""
Initiate Classifer
Initiate Classifier

Parameters
----------
Expand Down Expand Up @@ -127,11 +127,11 @@ def _scaleFeatures(self,X):
Feature scaling. Data are standardized by StandardScaler (zero mean, unit variance).

Importantly, the scaler is not retrained once it was initiated
to ensure that the scaling remains similiar for predictors.
to ensure that the scaling remains similar for predictors.

Parameters
----------
X : two dimensional numpy array (feature paris in rows)
X : two dimensional numpy array (feature pairs in rows)
Distance matrix for feature pairs


Expand All @@ -153,7 +153,7 @@ def _gridOptimization(self,X,Y):

Parameters
----------
X : two dimensional numpy array (feature paris in rows)
X : two dimensional numpy array (feature pairs in rows)
Distance matrix for feature pairs
Y : numpy array
Array containing class labels of X (0,1)
Expand Down Expand Up @@ -181,7 +181,7 @@ def _gridOptimization(self,X,Y):

def getFeatureImportance(self):
"""
Returns estimatore feature imporantance, if estimator allows for this.
Returns estimator feature importance, if estimator allows for this.

Parameters
----------
Expand Down Expand Up @@ -215,7 +215,7 @@ def predict(self,X,scale=True):
Returns
-------
Two dimensional array (n feature pairs x predictors)
containing the class proability
containing the class probability
if predictors (default: 3 - see fit function)

"""
Expand Down Expand Up @@ -247,7 +247,7 @@ def fit(self, X, Y, kFold = 3, optimizedParams=None, pathToResults = '', plotROC
X : two dimensional numpy array
Distance matrix for feature pairs
Y : np.array
Class labels (1 - 0) for postive
Class labels (1 - 0) for positive
and negative interaction
kFold : int
Number of cross validations. Equals the number of predictors.
Expand Down Expand Up @@ -275,7 +275,7 @@ def fit(self, X, Y, kFold = 3, optimizedParams=None, pathToResults = '', plotROC
if self.gridSerach is not None and optimizedParams is None:
optimizedClassifier, optimizedParams = self._gridOptimization(X_train,y_train)
else:
print("Info :: Grid serach skipped. Automatically skipped when using Guassian NB or parameter 'classiferGridSearch' is None.")
print("Info :: Grid search skipped. Automatically skipped when using Gaussian NB or parameter 'classiferGridSearch' is None.")
optimizedClassifier = self.classifier
#cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2)
if optimizedParams is not None:
Expand All @@ -284,7 +284,7 @@ def fit(self, X, Y, kFold = 3, optimizedParams=None, pathToResults = '', plotROC

self.predictors = [optimizedClassifier]
probasOut = optimizedClassifier.predict_proba(X)
#predict probabiliteis for complete data set to create a classfier report.
#predict probabilities for complete data set to create a classifier report.
tprs = []
aucs = []
oobScore = np.nan
Expand Down
34 changes: 17 additions & 17 deletions src/modules/Signal.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ def __init__(self,
"""Signal module for pre-processing and modeling


The Signal module allows to do severl pre-processing/modelling
The Signal module supports several pre-processing/modelling
steps such as
a) smoothing (rolling average)
b) filtering by number of nonNaN values
c) removal of single data points (surrounded by zeros or nans)
b) Peak detection (finds peaks) - required for further anaylsis
d) Peak detection (finds peaks) - required for further analysis

The peak modelling allows for usage of `LorentzianModel` or `GaussianModel`

Expand Down Expand Up @@ -131,34 +131,34 @@ def _removeSingleDataPointPeaks(self):

"""
peaksFiltered = 0
flilteredY = []
filteredY = []

for i,x in enumerate(self.Y):
if i == 0: #first item is different
if self.Y[i+1] == 0:
flilteredY.append(0)
filteredY.append(0)
if self.Y[i] > 0:
peaksFiltered += 1
else:
flilteredY.append(x)
filteredY.append(x)

elif i == self.Y.size - 1: #last item also
if self.Y[-1] != 0 and self.Y[-1]:
flilteredY.append(0)
filteredY.append(0)
if self.Y[i] > 0:
peaksFiltered += 1
else:
flilteredY.append(x)
filteredY.append(x)

else:
if self.Y[i-1] == 0 and self.Y[i+1] == 0:
flilteredY.append(0)
filteredY.append(0)
if self.Y[i] > 0:
peaksFiltered += 1
else:
flilteredY.append(x)
filteredY.append(x)

return np.array(flilteredY), peaksFiltered
return np.array(filteredY), peaksFiltered

def isValid(self, nonZero = 4):
"""Returns true if signal contains more than
Expand All @@ -173,7 +173,7 @@ def isValid(self, nonZero = 4):

Returns
-------
boolean, True if vald
boolean, True if valid

"""
valid = np.sum(self.Y > 0) > nonZero
Expand Down Expand Up @@ -241,15 +241,15 @@ def _addParams(self,modelParams,prefix,peakIdx,i):
Parameters
----------

mdeolParams :
modelParams :
modelParam object. Returned by model.make_params() (lmfit package)
Documentation: https://lmfit.github.io/lmfit-py/model.html

prefix : str
Prefix for the model (e.g. peak), defaults to f'm{i}_'.format(i)

peakIdx : int
Arary index at which the peak was detected in the Signal arary self.Y
Array index at which the peak was detected in the Signal array self.Y

i : int
index of detected models
Expand All @@ -263,7 +263,7 @@ def _addParams(self,modelParams,prefix,peakIdx,i):


if self.avoidWideSmallPeaks and self.Y[peakIdx[i]] < np.max(self.Y) * 0.2:
#small peaks should not be to wide!
#small peaks should not be too wide!
self._addParam(modelParams,
name=prefix+'amplitude',
max = self.Y[peakIdx[i]] * 1.2 * np.pi,
Expand Down Expand Up @@ -328,7 +328,7 @@ def _findParametersForModels(self,spec,peakIdx):
def _checkPeakIdx(self,peakIdx, maxPeaks = 15):
"""
Checks if the number of peaks exceeds the max number of
allwed peaks. (paramater: maxPeaks)
allowed peaks. (parameter: maxPeaks)

If the number exceeds maxPeaks, the peaks with the
highest value are taken. Others are removed
Expand Down Expand Up @@ -362,7 +362,7 @@ def fitModel(self):
"""
Fits the model (ensemble of several peaks).
The number of models equals the number of
detected peaks. Please not that that the maximum
detected peaks. Please note that the maximum
number of peaks is limited by the parameter:

maxPeaks (defaults to 12)
Expand All @@ -371,7 +371,7 @@ def fitModel(self):

- peak models + signal profile are plotted and saved as pdf (folder modelPlots)

- if squaredR for the model fit is below threshold (r2Tresh - deufault 0.85), the
- if squaredR for the model fit is below threshold (r2Tresh - default 0.85), the
signal profile is ignored. A message is printed if this happens.

Parameters
Expand Down
Loading