Materials-Data-Science-and-Informatics
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎dashboard/analysis.py‎
Lines changed: 144 additions & 9 deletions b/‎dashboard/analysis.py‎
Lines changed: 144 additions & 9 deletions
diff --git a/‎dashboard/data/display_specifications/barchart_allowed.txt‎
Lines changed: 0 additions & 16 deletions b/‎dashboard/data/display_specifications/barchart_allowed.txt‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎dashboard/data/display_specifications/corr_chart_allowed.txt‎
Lines changed: 0 additions & 16 deletions b/‎dashboard/data/display_specifications/corr_chart_allowed.txt‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎dashboard/data/display_specifications/filters.txt‎
Lines changed: 0 additions & 9 deletions b/‎dashboard/data/display_specifications/filters.txt‎
Lines changed: 0 additions & 9 deletions
diff --git a/‎dashboard/data/display_specifications/filters_methods.txt‎
Lines changed: 0 additions & 6 deletions b/‎dashboard/data/display_specifications/filters_methods.txt‎
Lines changed: 0 additions & 6 deletions
@@ -1,3 +1,6 @@
+# custom
+*.csv
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 
@@ -38,7 +38,8 @@ def calculate_crosstab(df: pd.DataFrame, data_key1: str, data_key2: str, id_vars
     return df_crosstab
 
 
-def filter_dataframe(df: pd.DataFrame, include: list=None, exclude: List[Tuple[str, list]]=None, exclude_nan=True, as_type="category") -> pd.DataFrame:
+def filter_dataframe(df: pd.DataFrame, include: list=None, exclude: List[Tuple[str, list]]=None, 
+                exclude_nan=True, exclude_anonymized=True, as_type="category") -> pd.DataFrame:
     """
     Filter pandas dataframe
 
@@ -48,29 +49,163 @@ def filter_dataframe(df: pd.DataFrame, include: list=None, exclude: List[Tuple[s
     df = filter_dataframe(surveydata, include=["careerLevel", "docStructured", "researchArea"], exclude=[("careerLevel", to_exclude)])
     ```
     """
-    
+
+
     if include is not None:
         df = df[include].dropna(how = "all", subset = include).astype(as_type)
 
     for key, val in exclude:
-        print(key, val)
+        #print(key, val)
         df = df.loc[~df[key].isin(val)]
 
     if exclude_nan:
         for key in df.keys():
             df = df.loc[~df[key].isna()]
+
+    if exclude_anonymized:
+        df = df.replace(to_replace="Anonymized", value="") 
+
     return df
 
-def get_all_values(df: pd.DataFrame, key: str) -> dict:
+def get_all_values(df: pd.DataFrame, keylist: List[str], display_dict=None) -> dict:
     """
-    Count all values of a given key in a data frame and 
+    Count all values of a given key from a key list in a data frame and 
     return these values in a dictionary sorted.
+
     """
-    all_areas = df[key].value_counts()
-    all_areas = all_areas.sort_index()
-    data = {'All': all_areas.values, key:list(all_areas.keys())}
+    if len(keylist) == 1:
+        key = keylist[0]
+        all_areas = df[key].value_counts()
+        all_areas = all_areas.sort_index()
+        data = {'All': all_areas.values, key:list(all_areas.keys())}
+    else: # multiple keys now the keys become the xticks
+        combined = {}
+        for key in keylist:
+            if display_dict is not None:
+                xtick = display_dict[key]
+            else:
+                xtick = key
+            xtick = xtick.replace(' \n', '')
+            temp = df[key]
+            temp.replace(to_replace=True, value=xtick, inplace=True)
+            temp.replace(to_replace=False, value=None, inplace=True)
+            a = temp.value_counts()
+            if a.empty:
+                combined[xtick] = 0
+            else:
+                for i, ke in enumerate(a.keys()):
+                    # because other can contain all... others..
+                    ke = ke.lower() # sometimes there are mixed upper and lower case keys...
+                    ke = ke.replace(' \n', '') # some are with and without breaks
+                    temp_val = combined.get(ke, 0)
+                    temp_val = temp_val + a.values[i]
+                    combined[ke] = temp_val
+            # greedy, there is probably a pandas way to do this...
+            # there is a problem if df is empty, i.e temp.value_counts() True 0
+            for i, ke in enumerate(a.keys()):
+                ke = ke.lower() # sometimes there are mixed upper and lower case keys...
+                ke = ke.replace(' \n', '') # some are with and without breaks
+                temp_val = combined.get(ke, 0)
+                temp_val = temp_val + a.values[i]
+                combined[ke] = temp_val
+        data = {'All' : list(combined.values()), key:list(combined.keys())}
     return data
 
+def prepare_data_research_field(df: pd.DataFrame, keylist:List[str], key2:str='researchArea', sort_as=None, display_dict= None):# -> dict, list:
+    """Creates a dict dictionary with data in the form needed by the plotting functions
+    
+    We prepare several outputs, i.e y_keys because they can have different length and one should be able to create a 
+    ColumnDataSource by ColumnDataSource(data=data)
+    :param df: [description]
+    :type df: pd.DataFrame
+    :param key: [description]
+    :type key: str
+
+    example:
+    prepare_data_research_field(df, key=careerLevel)
+        {'Cum. Sum': array([  0,   0, 130, 128, 148, 272,   0]),
+         'careerLevel': ['Director (of the institute)',
+          'Other',
+          'PhD student',
+          'Postdoc',
+          'Principal Investigator',
+          'Research associate',
+          'Undergraduate / Masters student'],
+         'researchArea': ['Engineering Science',
+          'Physics',
+          'Life Science',
+          'Earth Science',
+          'Chemistry',
+          'Other',
+          'Psychology',
+          'Mathematics'],
+         'Engineering Science': array([  0,   0,  47,  30,  52, 134,   0]),
+         'Physics': array([ 0,  0, 33, 38, 39, 57,  0]),
+         'Life Science': array([ 0,  0, 28, 29, 27, 33,  0]),
+         'Earth Science': array([ 0,  0,  8, 11, 18, 32,  0]),
+         'Chemistry': array([ 0,  0,  9, 12,  6,  4,  0]),
+         'Other': array([0, 0, 1, 2, 3, 6, 0]),
+         'Psychology': array([0, 0, 3, 2, 3, 2, 0]),
+         'Mathematics': array([0, 0, 1, 4, 0, 4, 0])}
+
+    """
+    research_areas = list(df[key2].value_counts().keys())
+    y_keys = ['Cum. Sum'] + research_areas
+    # Multiple columns will be combined. A single column will be treated differently
+    if len(keylist) == 1:
+        key = keylist[0]
+        all_areas = df[key].value_counts()
+        all_areas = all_areas.sort_index()
+        data = {'Cum. Sum': all_areas.values, key:list(all_areas.keys()), 'x_value': list(all_areas.keys())}
+        for area in research_areas:
+            area_counts = df[df[key2] == area][key].value_counts()
+            area_counts = area_counts.sort_index()
+            data[area] = area_counts.values
+    else:        
+        combined = {}
+        data = {}
+        for key in keylist:
+            if display_dict is not None:
+                xtick = display_dict[key]
+            else:
+                xtick = key
+            xtick = xtick.replace(' \n', '')
+            temp = df[key]
+            temp.replace(to_replace=True, value=xtick, inplace=True)
+            temp.replace(to_replace=False, value=None, inplace=True)
+            a = temp.value_counts()
+            # greedy, there is probably a pandas way to do this...
+            # there is a problem if df is empty, i.e temp.value_counts() True 0
+            for i, ke in enumerate(a.keys()):
+                # because other can contain all... others..
+                ke = ke.lower() # sometimes there are mixed upper and lower case keys...
+                ke = ke.replace(' \n', '') # some are with and without breaks
+                temp_val = combined.get(ke, 0)
+                temp_val = temp_val + a.values[i]
+                combined[ke] = temp_val
+        
+            for area in research_areas:
+                area_counts = df[df[key2] == area][key]
+                temp = data.get(area, [])
+
+                area_counts.replace(to_replace=True, value=xtick, inplace=True)
+                area_counts.replace(to_replace=False, value=None, inplace=True)
+                area_counts.value_counts()
+                area_counts = area_counts.sort_index()
+                
+                print(area_counts)
+                if area_counts.empty:
+                    temp.append(0)
+                else:
+                    temp.append(list(area_counts.values))
+                data[area] = temp
+        
+        data['Cum. Sum'] = list(combined.values())
+        data['x_value'] = list(combined.keys())
+
+    return data, y_keys
+
+'''
 def prepare_data_research_field(df: pd.DataFrame, key:str, key2:str='researchArea', sort_as=None):# -> dict, list:
     """Creates a dict dictionary with data in the form needed by the plotting functions
     
@@ -120,7 +255,7 @@ def prepare_data_research_field(df: pd.DataFrame, key:str, key2:str='researchAre
         data[area] = area_counts.values
     
     return data, y_keys
-
+'''
 '''
 def prepare_data_research_field(df: pd.DataFrame, key:str):
     """AI is creating summary for prepare_data_researchfield