Skip to content

Commit 641f846

Browse files
committed
Add first version of dashboard with more advanced barplot and survey data
1 parent 86d531a commit 641f846

14 files changed

Lines changed: 7697 additions & 786 deletions

dashboard/analysis.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
2+
import pandas as pd
3+
from typing import List, Tuple
4+
5+
def calculate_crosstab(
    df: pd.DataFrame,
    data_key1: str,
    data_key2: str,
    id_vars: str = None,
    astype: str = "int",
) -> pd.DataFrame:
    """Calculate the cross table for two keys of a pandas data frame.

    The result is in long format with one row per
    (``data_key1`` value, ``data_key2`` value) combination and the columns

    * ``data_key1`` and ``data_key2`` -- the two category values,
    * ``value`` -- number of rows of ``df`` with that combination,
    * ``total`` -- number of non-missing ``data_key1`` answers with that
      ``data_key1`` value (rows with a missing ``data_key2`` are counted
      here as well),
    * ``percentage`` -- ``value / total * 100``.

    :param df: data to analyse
    :param data_key1: column whose values form the groups (rows)
    :param data_key2: column that is counted within each group
    :param id_vars: identifier column handed to ``melt``; defaults to
        ``data_key1``. The ``total`` lookup below indexes the melted frame
        by ``data_key1``, so any other value raises a ``KeyError``.
    :param astype: dtype the ``total`` column is cast to
    :return: the long-format cross table described above
    """
    if id_vars is None:
        id_vars = data_key1

    # Number of answers per group. value_counts() skips missing values, so
    # no intermediate (categorical) copy of the two columns is needed.
    totals = df[data_key1].value_counts().to_dict()

    df_crosstab = (
        pd.crosstab(df[data_key1], df[data_key2], margins=False)
        .reset_index()
        .melt(id_vars=[id_vars])
    )

    # include total answers by group in the cross table
    df_crosstab["total"] = df_crosstab[data_key1].map(totals).astype(astype)

    # relative amount of answers within each group
    df_crosstab["percentage"] = (df_crosstab["value"] / df_crosstab["total"]) * 100

    return df_crosstab
25+
26+
27+
def filter_dataframe(
    df: pd.DataFrame,
    include: list = None,
    exclude: List[Tuple[str, list]] = None,
    exclude_nan=True,
) -> pd.DataFrame:
    """Filter a pandas dataframe by columns, values and missing data.

    The steps are applied in this order:

    1. If ``include`` is given, keep only these columns, drop rows in which
       all of them are missing and convert the columns to ``category``.
    2. For every ``(column, values)`` pair in ``exclude``, drop the rows
       whose ``column`` entry is one of ``values``.
    3. If ``exclude_nan`` is true, drop every row that still has a missing
       value in any column.

    :param df: data to filter
    :param include: column names to keep; ``None`` keeps all columns
    :param exclude: ``(column, values)`` pairs to remove; ``None`` (the
        default) removes nothing
    :param exclude_nan: drop rows containing any missing value
    :return: the filtered data frame

    example:
    ```
    to_exclude = ['Other', 'Undergraduate / Masters student', 'Director (of the institute)']
    df = filter_dataframe(surveydata, include=["careerLevel", "docStructured", "researchArea"], exclude=[("careerLevel", to_exclude)])
    ```
    """
    if include is not None:
        df = df[include].dropna(how="all", subset=include).astype("category")

    # `exclude` defaults to None, which must mean "exclude nothing" rather
    # than fail when iterated.
    for key, val in exclude or []:
        df = df.loc[~df[key].isin(val)]

    if exclude_nan:
        df = df.dropna(how="any")
    return df
48+
49+
50+
def prepare_data_research_field(
    df: pd.DataFrame, key: str, key2: str = 'researchArea'
) -> Tuple[dict, list]:
    """Create a dictionary with data in the form needed by the plotting functions.

    Several outputs are prepared, i.e. ``y_keys`` is returned separately
    because the number of areas can vary, and one should be able to create a
    ColumnDataSource by ``ColumnDataSource(data=data)``. For that, every
    count array in ``data`` has one entry per distinct value of ``key``, in
    the same (index-sorted) order as ``data['x_value']``; values of ``key``
    that do not occur within an area are counted as 0.

    :param df: data to count
    :type df: pd.DataFrame
    :param key: column whose values are counted (the x axis)
    :type key: str
    :param key2: column by which the counts are split up
    :type key2: str
    :return: ``(data, y_keys)`` where ``data`` holds the overall counts under
        ``'All'``, the values of ``key`` under both ``key`` and
        ``'x_value'`` and one count array per value of ``key2``;
        ``y_keys`` is ``['All']`` followed by the values of ``key2``
        ordered by decreasing frequency.

    example:
    prepare_data_research_field(df, key='careerLevel')
    ({'All': array([130, 128, 148, 272]),
      'careerLevel': ['PhD student',
                      'Postdoc',
                      'Principal Investigator',
                      'Research associate'],
      'x_value': ['PhD student',
                  'Postdoc',
                  'Principal Investigator',
                  'Research associate'],
      'Engineering Science': array([47, 30, 52, 134]),
      'Physics': array([33, 38, 39, 57]),
      ...},
     ['All', 'Engineering Science', 'Physics', ...])
    """
    all_areas = df[key].value_counts().sort_index()
    x_values = list(all_areas.keys())
    research_areas = list(df[key2].value_counts().keys())

    data = {'All': all_areas.values, key: x_values, 'x_value': list(x_values)}
    y_keys = ['All'] + research_areas

    for area in research_areas:
        area_counts = df[df[key2] == area][key].value_counts()
        # Align with the overall categories so that every array has the same
        # length and order as 'x_value', even if a value of `key` never
        # occurs within this area.
        data[area] = area_counts.reindex(all_areas.index, fill_value=0).values

    return data, y_keys

dashboard/data/20211130_HMCCommSurvey_clean.csv

Lines changed: 1164 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
careerLevel
2+
centerAffiliation
3+
dataFormats
4+
dataGenMethodSpec
5+
dataGenMethod
6+
Hub
7+
researchAreaCombined
8+
yearsInResearch
9+
software
10+
servNeeds_sub
11+
servFormat
12+
researchArea
13+
pubMotivation
14+
pubObstaclesA
15+
docStructured
16+
docStandards

dashboard/data/filters.txt

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
All
2-
AST
3-
Earth & Environment
4-
Energy
5-
Health
6-
Information
7-
Matter
2+
Chemistry
3+
Earth Science
4+
Engineering Science
5+
Life Science
6+
Mathematics
7+
Other
8+
Physics
9+
Psychology

dashboard/data/filters_hub.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
All
2+
AST
3+
Earth & Environment
4+
Energy
5+
Health
6+
Information
7+
Matter

dashboard/data/germany_map.svg

Lines changed: 1 addition & 0 deletions
Loading

0 commit comments

Comments
 (0)