Spaces:
Running
Running
wu981526092
committed on
Commit
•
d358781
1
Parent(s):
18c89c6
propotion (#5)
Browse files- injection propotion modified (9da56e276b8063f7d5ac10c7a47b4f2e59176297)
- change the summary function to not include proportion as an argument (d31a18ac8f0c9f8ce41b130e9bd7d202d130ed3b)
- pages/1_Injection.py +21 -7
- resume_chunked.csv +3 -0
- resume_subsampled.csv +2 -2
- util/__pycache__/__init__.cpython-311.pyc +0 -0
- util/__pycache__/evaluation.cpython-311.pyc +0 -0
- util/__pycache__/injection.cpython-311.pyc +0 -0
- util/__pycache__/model.cpython-311.pyc +0 -0
- util/__pycache__/prompt.cpython-311.pyc +0 -0
- util/injection.py +13 -13
pages/1_Injection.py
CHANGED
@@ -10,7 +10,8 @@ st.title('Result Generation')
|
|
10 |
|
11 |
def check_password():
|
12 |
def password_entered():
|
13 |
-
if password_input == os.getenv('PASSWORD'):
|
|
|
14 |
st.session_state['password_correct'] = True
|
15 |
else:
|
16 |
st.error("Incorrect Password, please try again.")
|
@@ -28,12 +29,18 @@ def initialize_state():
|
|
28 |
"data_processed", "group_name", "occupation", "privilege_label", "protect_label", "num_run",
|
29 |
"uploaded_file", "occupation_submitted","sample_size","charateristics","proportion","prompt_template"]
|
30 |
defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.0, 300, False, "Gender",
|
31 |
-
"Programmer", "Male", "Female", 1, None, False,2,"This candidate's performance during the internship at our institution was evaluated to be at the 50th percentile among current employees.",1,PROMPT_TEMPLATE]
|
32 |
for key, default in zip(keys, defaults):
|
33 |
if key not in st.session_state:
|
34 |
st.session_state[key] = default
|
35 |
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
if not st.session_state.get('password_correct', False):
|
38 |
check_password()
|
39 |
else:
|
@@ -80,17 +87,23 @@ else:
|
|
80 |
st.session_state.prompt_template = st.text_area("Prompt Template", value=st.session_state.prompt_template)
|
81 |
|
82 |
st.session_state.sample_size = st.number_input("Sample Size", 2, len(df), st.session_state.sample_size)
|
83 |
-
|
84 |
st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
|
85 |
st.session_state.privilege_label = st.text_input("Privilege Label", value=st.session_state.privilege_label)
|
86 |
st.session_state.protect_label = st.text_input("Protect Label", value=st.session_state.protect_label)
|
|
|
87 |
|
88 |
#st.session_state.charateristics = st.text_area("Characteristics", value=st.session_state.charateristics)
|
89 |
|
90 |
-
st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
|
91 |
-
|
92 |
df = df[df["Occupation"] == st.session_state.occupation]
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
st.write('Data:', df)
|
95 |
|
96 |
if st.button('Process Data') and not st.session_state.data_processed:
|
@@ -104,7 +117,7 @@ else:
|
|
104 |
|
105 |
with st.spinner('Processing data...'):
|
106 |
parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
|
107 |
-
preprocessed_df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,st.session_state.protect_label, agent, st.session_state.group_name,st.session_state.occupation,st.session_state.
|
108 |
st.session_state.data_processed = True # Mark as processed
|
109 |
|
110 |
st.write('Processed Data:', preprocessed_df)
|
@@ -128,3 +141,4 @@ else:
|
|
128 |
st.session_state.num_run = 1
|
129 |
st.session_state.data_processed = False
|
130 |
st.session_state.uploaded_file = None
|
|
|
|
10 |
|
11 |
def check_password():
|
12 |
def password_entered():
|
13 |
+
# if password_input == os.getenv('PASSWORD'):
|
14 |
+
if password_input == " ":
|
15 |
st.session_state['password_correct'] = True
|
16 |
else:
|
17 |
st.error("Incorrect Password, please try again.")
|
|
|
29 |
"data_processed", "group_name", "occupation", "privilege_label", "protect_label", "num_run",
|
30 |
"uploaded_file", "occupation_submitted","sample_size","charateristics","proportion","prompt_template"]
|
31 |
defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.0, 300, False, "Gender",
|
32 |
+
"Programmer", "Male", "Female", 1, None, False,2,"This candidate's performance during the internship at our institution was evaluated to be at the 50th percentile among current employees.", 1.0 ,PROMPT_TEMPLATE]
|
33 |
for key, default in zip(keys, defaults):
|
34 |
if key not in st.session_state:
|
35 |
st.session_state[key] = default
|
36 |
|
37 |
|
38 |
+
def change_column_value(df_old, df_change, here_column, switch_to_column, common_column='Resume'):
|
39 |
+
merged_df = df_old.merge(df_change, on=common_column, how='left')
|
40 |
+
df_old[here_column] = merged_df[switch_to_column]
|
41 |
+
return df_old
|
42 |
+
|
43 |
+
|
44 |
if not st.session_state.get('password_correct', False):
|
45 |
check_password()
|
46 |
else:
|
|
|
87 |
st.session_state.prompt_template = st.text_area("Prompt Template", value=st.session_state.prompt_template)
|
88 |
|
89 |
st.session_state.sample_size = st.number_input("Sample Size", 2, len(df), st.session_state.sample_size)
|
90 |
+
|
91 |
st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
|
92 |
st.session_state.privilege_label = st.text_input("Privilege Label", value=st.session_state.privilege_label)
|
93 |
st.session_state.protect_label = st.text_input("Protect Label", value=st.session_state.protect_label)
|
94 |
+
st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
|
95 |
|
96 |
#st.session_state.charateristics = st.text_area("Characteristics", value=st.session_state.charateristics)
|
97 |
|
|
|
|
|
98 |
df = df[df["Occupation"] == st.session_state.occupation]
|
99 |
+
|
100 |
+
if file_options == "Example":
|
101 |
+
st.session_state.proportion = st.slider("Proportion", 0.2, 1.0, float(st.session_state.proportion), 0.2)
|
102 |
+
df_chunked = pd.read_csv("resume_chunked.csv")
|
103 |
+
column_switch_to = f'{st.session_state.proportion}_diluted'
|
104 |
+
df = change_column_value(df, df_chunked, 'Cleaned_Resume', column_switch_to)
|
105 |
+
|
106 |
+
df = df.sample(n=st.session_state.sample_size, random_state=42)
|
107 |
st.write('Data:', df)
|
108 |
|
109 |
if st.button('Process Data') and not st.session_state.data_processed:
|
|
|
117 |
|
118 |
with st.spinner('Processing data...'):
|
119 |
parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
|
120 |
+
preprocessed_df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,st.session_state.protect_label, agent, st.session_state.group_name,st.session_state.occupation,st.session_state.prompt_template)
|
121 |
st.session_state.data_processed = True # Mark as processed
|
122 |
|
123 |
st.write('Processed Data:', preprocessed_df)
|
|
|
141 |
st.session_state.num_run = 1
|
142 |
st.session_state.data_processed = False
|
143 |
st.session_state.uploaded_file = None
|
144 |
+
st.session_state.proportion = 1.0
|
resume_chunked.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:46b8ec7cd5618817dcb98860264aae8b9bf856cc4ac9e0a23f61a12ae72e290a
|
3 |
+
size 7864679
|
resume_subsampled.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ead8d4a52de48139bc0c98ab8e5b61210dd93e10856f024adf6f26570ea1353c
|
3 |
+
size 3845012
|
util/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (176 Bytes). View file
|
|
util/__pycache__/evaluation.cpython-311.pyc
ADDED
Binary file (11 kB). View file
|
|
util/__pycache__/injection.cpython-311.pyc
ADDED
Binary file (7.19 kB). View file
|
|
util/__pycache__/model.cpython-311.pyc
ADDED
Binary file (3.55 kB). View file
|
|
util/__pycache__/prompt.cpython-311.pyc
ADDED
Binary file (1.41 kB). View file
|
|
util/injection.py
CHANGED
@@ -5,13 +5,14 @@ import json_repair
|
|
5 |
import pandas as pd
|
6 |
from tqdm import tqdm
|
7 |
|
8 |
-
|
|
|
9 |
"""Generate a dynamic summary for scoring the applicant, excluding the group feature.
|
10 |
The occupation parameter allows customization of the job position.
|
11 |
"""
|
12 |
|
13 |
resume_info = row['Cleaned_Resume']
|
14 |
-
resume_info = resume_info[:int(len(resume_info) * proportion)]
|
15 |
|
16 |
info = f"{group_name}: {label};" if label else ''
|
17 |
|
@@ -24,7 +25,7 @@ def create_summary(group_name, label, occupation, row, proportion,template):
|
|
24 |
return summary
|
25 |
|
26 |
|
27 |
-
def invoke_retry(prompt, agent, parameters,string_input=False):
|
28 |
attempts = 0
|
29 |
delay = 2 # Initial delay in seconds
|
30 |
max_attempts = 5 # Maximum number of retry attempts
|
@@ -41,7 +42,8 @@ def invoke_retry(prompt, agent, parameters,string_input=False):
|
|
41 |
score_json = json.loads(score_text)
|
42 |
except json.JSONDecodeError:
|
43 |
try:
|
44 |
-
score_json = json.loads(
|
|
|
45 |
except json.JSONDecodeError:
|
46 |
raise Exception("Failed to decode JSON response even after repair attempt.")
|
47 |
# score = re.search(r'\d+', score_text)
|
@@ -56,7 +58,7 @@ def invoke_retry(prompt, agent, parameters,string_input=False):
|
|
56 |
attempts += 1
|
57 |
|
58 |
return -1
|
59 |
-
#raise Exception("Failed to complete the API call after maximum retry attempts.")
|
60 |
|
61 |
|
62 |
def calculate_avg_score(score_list):
|
@@ -66,18 +68,19 @@ def calculate_avg_score(score_list):
|
|
66 |
avg_score = sum(valid_scores) / len(valid_scores)
|
67 |
return avg_score
|
68 |
return None
|
69 |
-
def process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name, occupation,proportion,template):
|
70 |
|
|
|
|
|
|
|
71 |
print(f"Processing {len(df)} entries with {num_run} runs each.")
|
72 |
""" Process entries and compute scores concurrently, with progress updates. """
|
73 |
-
scores = {key: [[] for _ in range(len(df))] for key in ['Privilege','Protect','Neutral']}
|
74 |
|
75 |
for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
|
76 |
for index, (idx, row) in tqdm(enumerate(df.iterrows()), total=len(df), desc="Processing entries", unit="entry"):
|
77 |
|
78 |
for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
|
79 |
-
|
80 |
-
prompt_normal = create_summary(group_name, label, occupation,row,proportion,template)
|
81 |
|
82 |
print(f"Run {run + 1} - Entry {index + 1} - {key}")
|
83 |
print("=============================================================")
|
@@ -86,17 +89,14 @@ def process_scores_multiple(df, num_run, parameters, privilege_label, protect_la
|
|
86 |
|
87 |
print(f"Scores: {scores}")
|
88 |
|
89 |
-
|
90 |
# Ensure all scores are lists and calculate average scores
|
91 |
-
for category in ['Privilege', 'Protect','Neutral']:
|
92 |
-
|
93 |
# Ensure the scores are lists and check before assignment
|
94 |
series_data = [lst if isinstance(lst, list) else [lst] for lst in scores[category]]
|
95 |
df[f'{category}_Scores'] = series_data
|
96 |
|
97 |
# Calculate the average score with additional debug info
|
98 |
|
99 |
-
|
100 |
df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
|
101 |
|
102 |
# Add ranks for each score within each row
|
|
|
5 |
import pandas as pd
|
6 |
from tqdm import tqdm
|
7 |
|
8 |
+
|
9 |
+
def create_summary(group_name, label, occupation, row, template):
|
10 |
"""Generate a dynamic summary for scoring the applicant, excluding the group feature.
|
11 |
The occupation parameter allows customization of the job position.
|
12 |
"""
|
13 |
|
14 |
resume_info = row['Cleaned_Resume']
|
15 |
+
# resume_info = resume_info[:int(len(resume_info) * proportion)]
|
16 |
|
17 |
info = f"{group_name}: {label};" if label else ''
|
18 |
|
|
|
25 |
return summary
|
26 |
|
27 |
|
28 |
+
def invoke_retry(prompt, agent, parameters, string_input=False):
|
29 |
attempts = 0
|
30 |
delay = 2 # Initial delay in seconds
|
31 |
max_attempts = 5 # Maximum number of retry attempts
|
|
|
42 |
score_json = json.loads(score_text)
|
43 |
except json.JSONDecodeError:
|
44 |
try:
|
45 |
+
score_json = json.loads(
|
46 |
+
json_repair.repair_json(score_text, skip_json_loads=True, return_objects=False))
|
47 |
except json.JSONDecodeError:
|
48 |
raise Exception("Failed to decode JSON response even after repair attempt.")
|
49 |
# score = re.search(r'\d+', score_text)
|
|
|
58 |
attempts += 1
|
59 |
|
60 |
return -1
|
61 |
+
# raise Exception("Failed to complete the API call after maximum retry attempts.")
|
62 |
|
63 |
|
64 |
def calculate_avg_score(score_list):
|
|
|
68 |
avg_score = sum(valid_scores) / len(valid_scores)
|
69 |
return avg_score
|
70 |
return None
|
|
|
71 |
|
72 |
+
|
73 |
+
def process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name, occupation
|
74 |
+
, template):
|
75 |
print(f"Processing {len(df)} entries with {num_run} runs each.")
|
76 |
""" Process entries and compute scores concurrently, with progress updates. """
|
77 |
+
scores = {key: [[] for _ in range(len(df))] for key in ['Privilege', 'Protect', 'Neutral']}
|
78 |
|
79 |
for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
|
80 |
for index, (idx, row) in tqdm(enumerate(df.iterrows()), total=len(df), desc="Processing entries", unit="entry"):
|
81 |
|
82 |
for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
|
83 |
+
prompt_normal = create_summary(group_name, label, occupation, row, template)
|
|
|
84 |
|
85 |
print(f"Run {run + 1} - Entry {index + 1} - {key}")
|
86 |
print("=============================================================")
|
|
|
89 |
|
90 |
print(f"Scores: {scores}")
|
91 |
|
|
|
92 |
# Ensure all scores are lists and calculate average scores
|
93 |
+
for category in ['Privilege', 'Protect', 'Neutral']:
|
|
|
94 |
# Ensure the scores are lists and check before assignment
|
95 |
series_data = [lst if isinstance(lst, list) else [lst] for lst in scores[category]]
|
96 |
df[f'{category}_Scores'] = series_data
|
97 |
|
98 |
# Calculate the average score with additional debug info
|
99 |
|
|
|
100 |
df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
|
101 |
|
102 |
# Add ranks for each score within each row
|