@@ -38,7 +38,8 @@ def calculate_crosstab(df: pd.DataFrame, data_key1: str, data_key2: str, id_vars
3838 return df_crosstab
3939
4040
def filter_dataframe(df: pd.DataFrame, include: list = None, exclude: List[Tuple[str, list]] = None,
                     exclude_nan=True, exclude_anonymized=True, as_type="category") -> pd.DataFrame:
    """
    Filter pandas dataframe

    The steps are applied in this order:

    1. If ``include`` is given, keep only these columns, drop the rows in which all of them
       are missing and cast the result to ``as_type``.
    2. For every ``(column, values)`` pair in ``exclude``, drop the rows whose ``column``
       value is contained in ``values``.
    3. If ``exclude_nan`` is set, drop every row that has a missing value in any remaining column.
    4. If ``exclude_anonymized`` is set, replace the literal value ``"Anonymized"`` by ``""``.
       The rows themselves are kept, only the marker is blanked.

    :param df: data to filter; a new frame is returned
    :type df: pd.DataFrame
    :param include: column names to keep, ``None`` keeps all columns (and skips the cast)
    :type include: list
    :param exclude: ``(column, values)`` pairs to drop, ``None`` excludes nothing
    :type exclude: List[Tuple[str, list]]
    :param exclude_nan: drop rows containing missing values
    :param exclude_anonymized: blank out ``"Anonymized"`` entries
    :param as_type: dtype the included columns are cast to
    :return: the filtered data frame
    :rtype: pd.DataFrame

    example:
    ```
    to_exclude = ["Other"]  # illustrative values
    df = filter_dataframe(surveydata, include=["careerLevel", "docStructured", "researchArea"], exclude=[("careerLevel", to_exclude)])
    ```
    """
    if include is not None:
        df = df[include].dropna(how="all", subset=include).astype(as_type)

    # ``exclude`` defaults to None, which must mean "exclude nothing" instead of
    # failing with a TypeError when it is iterated.
    for key, val in exclude or []:
        df = df.loc[~df[key].isin(val)]

    if exclude_nan:
        # A row is dropped as soon as any of its columns is missing.
        df = df.dropna(how="any")

    if exclude_anonymized:
        df = df.replace(to_replace="Anonymized", value="")

    return df
6369
def _count_labels(df: pd.DataFrame, keylist: List[str], display_dict=None) -> dict:
    """Count the answers of several columns into one ``{label: count}`` dictionary.

    ``True`` entries of a column are counted under the column's label (the
    ``display_dict`` entry if given, else the column name), ``False`` and missing
    entries are ignored and every other value is counted under its own text.
    All labels are lower-cased and stripped of line breaks, so variants that
    differ only in case or wrapping end up in the same bucket.
    A column without any countable entry still shows up with a count of 0.
    The data frame is only read, never modified.
    """
    combined = {}
    for key in keylist:
        label = key if display_dict is None else display_dict[key]
        label = label.replace('\n', '').lower()

        counts = df[key].value_counts()
        counted = False
        for value, count in zip(counts.index.tolist(), counts.tolist()):
            if pd.api.types.is_bool(value):
                if not value:
                    continue  # unchecked boxes do not contribute
                name = label
            else:
                # free text answers, e.g. from an "other" column
                name = str(value).replace('\n', '').lower()
            combined[name] = combined.get(name, 0) + count
            counted = True

        if not counted:
            # keep the xtick, but never overwrite a count collected from another column
            combined.setdefault(label, 0)
    return combined


def get_all_values(df: pd.DataFrame, keylist: List[str], display_dict=None) -> dict:
    """
    Count all values of a given key from a key list in a data frame and
    return these values in a dictionary sorted.

    With a single key the values of that column are counted and sorted by value.
    With several keys the columns are combined and the keys become the xticks:
    every column contributes the number of its ``True`` entries under its label,
    other (free text) values are counted under their own lower-cased text.

    :param df: data to count, it is not modified
    :type df: pd.DataFrame
    :param keylist: column names to evaluate, must not be empty
    :type keylist: List[str]
    :param display_dict: optional mapping column name -> label, only used with several keys
    :return: ``{'All': counts, <key>: labels}`` where ``<key>`` is the single key,
             or the last entry of ``keylist`` when several keys are combined
    :rtype: dict
    :raises ValueError: if ``keylist`` is empty
    """
    if not keylist:
        raise ValueError("keylist must contain at least one column name")

    if len(keylist) == 1:
        key = keylist[0]
        all_areas = df[key].value_counts().sort_index()
        return {'All': all_areas.values, key: list(all_areas.keys())}

    # multiple keys: now the keys become the xticks
    combined = _count_labels(df, keylist, display_dict)
    return {'All': list(combined.values()), keylist[-1]: list(combined.keys())}
73113
def _combined_label_counts(df: pd.DataFrame, keylist: List[str], display_dict=None) -> dict:
    """Count the answers of several columns into one ``{label: count}`` dictionary.

    ``True`` entries are counted under the column's label (``display_dict`` entry or
    column name), ``False`` and missing entries are ignored, any other value is
    counted under its own text. Labels are lower-cased and stripped of line breaks.
    A column without any countable entry is listed with a count of 0.
    The data frame is only read, never modified.
    """
    combined = {}
    for key in keylist:
        label = key if display_dict is None else display_dict[key]
        label = label.replace('\n', '').lower()

        counts = df[key].value_counts()
        counted = False
        for value, count in zip(counts.index.tolist(), counts.tolist()):
            if pd.api.types.is_bool(value):
                if not value:
                    continue
                name = label
            else:
                name = str(value).replace('\n', '').lower()
            combined[name] = combined.get(name, 0) + count
            counted = True

        if not counted:
            combined.setdefault(label, 0)
    return combined


def prepare_data_research_field(df: pd.DataFrame, keylist: List[str], key2: str = 'researchArea', sort_as=None, display_dict=None):  # -> dict, list:
    """Creates a dictionary with data in the form needed by the plotting functions

    We prepare several outputs, i.e y_keys because they can have different length and one should be able to create a
    ColumnDataSource by ColumnDataSource(data=data). Therefore every entry of ``data`` is aligned with
    ``data['x_value']``; combinations that do not occur in the data are filled with 0.

    :param df: survey data, it is not modified
    :type df: pd.DataFrame
    :param keylist: columns to evaluate. A single column is counted by its values, several
                    columns are combined and each column becomes one x value.
    :type keylist: List[str]
    :param key2: column by which the counts are split up
    :type key2: str
    :param sort_as: currently not used
    :param display_dict: optional mapping column name -> label, only used with several columns
    :return: ``(data, y_keys)`` where ``y_keys`` is ``['Cum. Sum']`` followed by the values of ``key2``
             ordered by frequency

    example:
    prepare_data_research_field(df, keylist=['careerLevel'])
    {'Cum. Sum': array([  0,   0, 130, 128, 148, 272,   0]),
    'careerLevel': ['Director (of the institute)',
    'Other',
    'PhD student',
    'Postdoc',
    'Principal Investigator',
    'Research associate',
    'Undergraduate / Masters student'],
    'x_value': [... same entries as 'careerLevel' ...],
    'Engineering Science': array([  0,   0,  47,  30,  52, 134,   0]),
    'Physics': array([ 0,  0, 33, 38, 39, 57,  0]),
    'Life Science': array([ 0,  0, 28, 29, 27, 33,  0]),
    'Earth Science': array([ 0,  0,  8, 11, 18, 32,  0]),
    'Chemistry': array([ 0,  0,  9, 12,  6,  4,  0]),
    'Other': array([0, 0, 1, 2, 3, 6, 0]),
    'Psychology': array([0, 0, 3, 2, 3, 2, 0]),
    'Mathematics': array([0, 0, 1, 4, 0, 4, 0])}

    """
    research_areas = list(df[key2].value_counts().keys())
    y_keys = ['Cum. Sum'] + research_areas

    # Multiple columns will be combined. A single column will be treated differently
    if len(keylist) == 1:
        key = keylist[0]
        all_areas = df[key].value_counts().sort_index()
        x_values = list(all_areas.keys())
        data = {'Cum. Sum': all_areas.values, key: x_values, 'x_value': list(x_values)}
        for area in research_areas:
            area_counts = df[df[key2] == area][key].value_counts()
            # An area may lack some of the answers; align with the overall x values
            # so that all columns have the same length.
            data[area] = area_counts.reindex(all_areas.index, fill_value=0).values
        return data, y_keys

    overall = _combined_label_counts(df, keylist, display_dict)
    labels = list(overall.keys())
    data = {}
    for area in research_areas:
        area_counts = _combined_label_counts(df[df[key2] == area], keylist, display_dict)
        data[area] = [area_counts.get(label, 0) for label in labels]

    data['Cum. Sum'] = list(overall.values())
    data['x_value'] = labels

    return data, y_keys
207+
208+ '''
74209def prepare_data_research_field(df: pd.DataFrame, key:str, key2:str='researchArea', sort_as=None):# -> dict, list:
75210 """Creates a dict dictionary with data in the form needed by the plotting functions
76211
@@ -120,7 +255,7 @@ def prepare_data_research_field(df: pd.DataFrame, key:str, key2:str='researchAre
120255 data[area] = area_counts.values
121256
122257 return data, y_keys
123-
258+ '''
124259'''
125260def prepare_data_research_field(df: pd.DataFrame, key:str):
126261 """AI is creating summary for prepare_data_researchfield
0 commit comments