1+
2+ import pandas as pd
3+ from typing import List , Tuple
4+
def calculate_crosstab(df: pd.DataFrame, data_key1: str, data_key2: str, id_vars: str = None, astype: str = "int") -> pd.DataFrame:
    """Calculate the cross table for two keys in a given pandas data frame.

    The result is in long format: one row per combination of a ``data_key1``
    value and a ``data_key2`` value, with the columns

    * ``value``      -- number of rows in ``df`` having that combination,
    * ``total``      -- number of rows per ``data_key1`` value, counted over
      all rows where at least one of the two keys is set,
    * ``percentage`` -- ``value`` relative to ``total``, in percent.

    :param df: input data
    :param data_key1: column whose values form the rows of the cross table
    :param data_key2: column whose values form the columns of the cross table
    :param id_vars: identifier column kept when melting; defaults to ``data_key1``
    :param astype: dtype the ``total`` column is cast to
    :return: melted cross table with ``total`` and ``percentage`` columns
    """
    id_column = data_key1 if id_vars is None else id_vars
    key_pair = [data_key1, data_key2]

    # Totals per data_key1 value; only rows where both keys are missing are dropped.
    answered = df[key_pair].dropna(how="all", subset=key_pair)
    as_categories = answered.astype("category")
    totals = dict(as_categories[data_key1].value_counts())

    # Wide cross table -> long format with one row per (key1, key2) pair.
    wide_table = pd.crosstab(df[data_key1], df[data_key2], margins=False)
    long_table = wide_table.reset_index().melt(id_vars=[id_column])

    # Attach the total number of answers for each data_key1 value.
    mapped_totals = long_table[data_key1].map(totals)
    long_table["total"] = mapped_totals.astype(astype)

    # Relative share of each combination within its data_key1 value.
    share = long_table["value"] / long_table["total"]
    long_table["percentage"] = share * 100

    return long_table
25+
26+
def filter_dataframe(df: pd.DataFrame, include: list = None, exclude: List[Tuple[str, list]] = None, exclude_nan=True) -> pd.DataFrame:
    """
    Filter pandas dataframe

    :param df: data frame to filter
    :param include: column names to keep; when given, rows in which all of
        these columns are missing are dropped and the remaining columns are
        converted to ``category`` dtype. ``None`` keeps all columns unchanged.
    :param exclude: list of ``(column, values)`` tuples; rows whose ``column``
        holds one of ``values`` are removed. ``None`` (the default) or an
        empty list removes nothing.
    :param exclude_nan: if true, finally drop every row that has a missing
        value in any remaining column
    :return: the filtered data frame

    example:
    ```
    to_exclude = ['Other', 'Undergraduate / Masters student', 'Director (of the institute)']
    df = filter_dataframe(surveydata, include=["careerLevel", "docStructured", "researchArea"], exclude=[("careerLevel", to_exclude)])
    ```
    """

    if include is not None:
        df = df[include].dropna(how="all", subset=include).astype("category")

    # `exclude` defaults to None; iterating None would raise a TypeError,
    # so treat it the same as "nothing to exclude".
    for key, val in exclude or []:
        df = df.loc[~df[key].isin(val)]

    if exclude_nan:
        # Drop rows with a missing value in any column.
        df = df.dropna(how="any")
    return df
48+
49+
def prepare_data_research_field(df: pd.DataFrame, key: str, key2: str = 'researchArea') -> Tuple[dict, list]:
    """Creates a dictionary with data in the form needed by the plotting functions

    The counts of every value of ``key`` are collected once over the whole
    frame (``'All'``) and once per value of ``key2``. All count arrays are
    aligned to the same sorted list of ``key`` values (``'x_value'``), with 0
    for values that do not occur in an area, so that one can create a
    ColumnDataSource by ColumnDataSource(data=data).

    The y_keys are returned separately because their number differs from the
    length of the data columns.

    :param df: survey data containing the columns ``key`` and ``key2``
    :type df: pd.DataFrame
    :param key: column whose values are counted (x axis)
    :type key: str
    :param key2: column by whose values the counts are split up
    :type key2: str
    :return: ``(data, y_keys)`` where ``data`` maps ``'All'`` and every
        ``key2`` value to an array of counts, and ``key`` and ``'x_value'``
        to the sorted list of ``key`` values; ``y_keys`` is ``['All']``
        followed by the ``key2`` values in ``value_counts()`` order

    example:
    prepare_data_research_field(df, key='careerLevel')
    ({'All': array([  0,   0, 130, 128, 148, 272,   0]),
      'careerLevel': ['Director (of the institute)',
                      'Other',
                      'PhD student',
                      'Postdoc',
                      'Principal Investigator',
                      'Research associate',
                      'Undergraduate / Masters student'],
      'x_value': [... same list as 'careerLevel' ...],
      'Engineering Science': array([  0,   0,  47,  30,  52, 134,   0]),
      'Physics': array([ 0,  0, 33, 38, 39, 57,  0]),
      'Life Science': array([ 0,  0, 28, 29, 27, 33,  0]),
      'Earth Science': array([ 0,  0,  8, 11, 18, 32,  0]),
      'Chemistry': array([ 0,  0,  9, 12,  6,  4,  0]),
      'Other': array([0, 0, 1, 2, 3, 6, 0]),
      'Psychology': array([0, 0, 3, 2, 3, 2, 0]),
      'Mathematics': array([0, 0, 1, 4, 0, 4, 0])},
     ['All', 'Engineering Science', 'Physics', 'Life Science', 'Earth Science',
      'Chemistry', 'Other', 'Psychology', 'Mathematics'])

    """
    all_areas = df[key].value_counts().sort_index()
    x_values = list(all_areas.keys())
    research_areas = list(df[key2].value_counts().keys())

    data = {'All': all_areas.values, key: x_values, 'x_value': list(x_values)}
    y_keys = ['All'] + research_areas

    for area in research_areas:
        area_counts = df[df[key2] == area][key].value_counts().sort_index()
        # For a non-categorical column value_counts() omits values that do not
        # occur in this area; reindex so every array lines up with 'x_value'.
        area_counts = area_counts.reindex(all_areas.index, fill_value=0)
        data[area] = area_counts.values

    return data, y_keys
99+
100+ '''
101+ def prepare_data_research_field(df: pd.DataFrame, key:str):
102+ """AI is creating summary for prepare_data_researchfield
103+
104+ :param df: [description]
105+ :type df: pd.DataFrame
106+ :param key: [description]
107+ :type key: str
108+ """
109+ all_areas = df[key].value_counts()
110+ all_areas = all_areas.sort_index()
111+ data = {'All': {'counts': all_areas.values, 'values': list(all_areas.keys())}}
112+ research_areas = list(df['researchArea'].value_counts().keys())
113+ for area in research_areas:
114+ area_counts = df[df["researchArea"] == area][key].value_counts()
115+ area_counts = area_counts.sort_index()
116+ data[area] = {'counts': area_counts.values, 'values': list(area_counts.keys())}
117+
118+ return data
119+ '''
0 commit comments