sebastianalgharaballi committed on
Commit
d148b11
1 Parent(s): 3385eb9

working with basic boolean query

Browse files
Files changed (1) hide show
  1. test_two_phase_search.py +130 -64
test_two_phase_search.py CHANGED
@@ -1,74 +1,140 @@
1
- from typing import List, Dict, Optional
2
- from opensearch_client import OpenSearchClient
3
  from chroma_storage import ChromaMatchingSystem
4
- from embeddings import JobPosting
 
 
5
 
6
class TwoPhaseSearchSystem:
    """Candidate search that filters with OpenSearch, then matches with ChromaDB.

    Phase 1 sends a boolean query to the OpenSearch client. Phase 2 asks the
    Chroma matcher for matches restricted to the jobseekers phase 1 returned.
    """

    def __init__(self, chroma_matcher: "ChromaMatchingSystem",
                 opensearch_client: "OpenSearchClient"):
        """Keep references to the two backends used by the search phases."""
        self.chroma_matcher = chroma_matcher
        self.opensearch_client = opensearch_client

    def search_candidates(self,
                          job_posting: "JobPosting",
                          search_params: Dict,
                          n_results: int = 10) -> List[Dict]:
        """
        Two-phase search:
        1. OpenSearch boolean filtering
        2. ChromaDB embedding matching

        Args:
            job_posting: Posting forwarded to the Chroma matcher.
            search_params: Payload passed as-is to
                ``opensearch_client.search_jobseekers``.
            n_results: Maximum number of matches requested from the matcher.

        Returns:
            One dict per Chroma match that also appears in the OpenSearch
            results, with keys ``jobseeker_id``, ``similarity_score``,
            ``field_scores``, ``explanation`` and ``opensearch_score``
            (``0.0`` when the OpenSearch hit carries no ``_score``).
            An empty list when OpenSearch returns nothing.
        """
        # Phase 1: OpenSearch filtering
        opensearch_results = self.opensearch_client.search_jobseekers(search_params)
        if not opensearch_results:
            return []

        # Index the hits once instead of rescanning the list for every match.
        # setdefault keeps the first hit per id, which is what a linear
        # first-match scan would have picked.
        hits_by_id = {}
        for hit in opensearch_results:
            hits_by_id.setdefault(hit['jobseeker_id'], hit)

        # Phase 2: ChromaDB matching, restricted to jobseekers that passed phase 1
        matches = self.chroma_matcher.get_matches(
            job_posting=job_posting,
            n_results=n_results,
            where_conditions={
                "jobseeker_id": {"$in": [hit['jobseeker_id'] for hit in opensearch_results]}
            },
        )

        # Combine OpenSearch scores with the ChromaDB match results
        final_results = []
        for match in matches:
            hit = hits_by_id.get(match.jobseeker_id)
            if not hit:
                # Match for a jobseeker OpenSearch did not return: drop it.
                continue
            final_results.append({
                'jobseeker_id': match.jobseeker_id,
                'similarity_score': match.similarity_score,
                'field_scores': match.field_scores,
                'explanation': match.explanation,
                'opensearch_score': hit.get('_score', 0.0),
            })

        return final_results

    def build_search_params(self,
                            job_posting: "JobPosting",
                            location: Optional[Dict] = None,
                            certifications: Optional[Dict] = None,
                            tags: Optional[List[Dict]] = None) -> Dict:
        """Build OpenSearch query parameters matching working format.

        Args:
            job_posting: Posting whose ``primary_skills`` entries expose a
                ``skill_name`` attribute; only the first two are used.
            location: Accepted for interface compatibility; not used here.
            certifications: Accepted for interface compatibility; not used here.
            tags: Accepted for interface compatibility; not used here.

        Returns:
            Payload dict with ``boolean_search_query``, ``skills``, ``size``
            and ``sort_by``.
        """
        # Only the first two primary skills go into the boolean query.
        primary_skills = [skill.skill_name for skill in job_posting.primary_skills][:2]

        if primary_skills:
            skills_part = ' OR '.join(primary_skills)
            boolean_query = f"Software AND ({skills_part})"
        else:
            # Without skills the query would end in an empty "()" group;
            # fall back to the bare keyword instead.
            boolean_query = "Software"

        return {
            "boolean_search_query": boolean_query,
            "skills": [],
            "size": 100,
            "sort_by": ["score"],
        }
 
 
1
+ from test_embeddings import create_test_data, transform_jobseeker_to_opensearch, IndependentJobSeekerAssessmentRDS
 
2
  from chroma_storage import ChromaMatchingSystem
3
+ from opensearch_client import OpenSearchClient
4
+ from two_phase_search import TwoPhaseSearchSystem
5
+ import json
6
 
7
def test_single_jobseeker(opensearch_client):
    """Index one hand-built jobseeker through the given client and report the outcome."""
    print("\nTesting OpenSearch with a single jobseeker...")

    jobseeker_id = "test_seeker_1"

    # Profile sections for the single test jobseeker
    work_history = [{
        "title": "Software Engineer",
        "company": "Tech Co.",
        "duration": "3 years",
        "description": "Developed scalable software systems and optimized cloud infrastructure."
    }]
    schooling = [{
        "degree": "Bachelor's",
        "field": "Computer Science",
        "institution": "Tech University"
    }]
    certs = [{
        "name": "AWS Solutions Architect",
        "organization": "AWS",
        "start_date": "2022-01",
        "end_date": "2025-01"
    }]

    test_jobseeker = IndependentJobSeekerAssessmentRDS(
        primary_skills=["Python", "AWS"],
        secondary_skills=["Docker", "CI/CD"],
        experiences=work_history,
        educations=schooling,
        certifications=certs,
    )

    # Convert to the OpenSearch payload and submit it for indexing
    response = opensearch_client.index_jobseeker(
        transform_jobseeker_to_opensearch(test_jobseeker, jobseeker_id)
    )

    if not response:
        print(f"Failed to index jobseeker {jobseeker_id}")
        return
    print(f"Successfully indexed jobseeker {jobseeker_id}: {response}")
 
 
 
43
 
44
def test_two_phase_search():
    """Test the complete two-phase search pipeline.

    Builds the Chroma matcher, the OpenSearch client and the two-phase search
    system, probes OpenSearch with a fixed boolean query, loads three test
    seekers into ChromaDB, builds search parameters from the test job posting,
    runs the search and prints the results. When the search yields nothing,
    the OpenSearch query is re-issued and its hit count printed for debugging.
    """
    print("\nTesting two-phase search system...")

    # Initialize components
    chroma_matcher = ChromaMatchingSystem()
    opensearch_client = OpenSearchClient()
    search_system = TwoPhaseSearchSystem(chroma_matcher, opensearch_client)

    # Check if OpenSearch database has any data
    print("\nChecking if OpenSearch database has any data...")
    test_payload = {
        "boolean_search_query": "Software AND (Python OR AWS)",  # Simple boolean query
        "skills": [],  # No skill requirements
        "size": 100,  # Retrieve up to 100 entries
        "sort_by": ["score"]
    }
    opensearch_results = opensearch_client.search_jobseekers(test_payload)
    if opensearch_results:
        print(f"OpenSearch returned {len(opensearch_results)} entries.")
    else:
        print("OpenSearch database is empty or the query returned no results.")

    # Get test data
    (job_posting, matching_seeker, matching_info,
     partial_matching_seeker, partial_matching_info,
     non_matching_seeker, non_matching_info) = create_test_data()

    print("\nAdding test seekers to ChromaDB...")
    # Each entry: (seeker id, processed data, unprocessed data, metadata)
    test_seekers = [
        ("seeker_1", matching_seeker, matching_info,
         {"experience_level": "senior"}),
        ("seeker_2", partial_matching_seeker, partial_matching_info,
         {"experience_level": "mid"}),
        ("seeker_3", non_matching_seeker, non_matching_info,
         {"experience_level": "mid"})
    ]

    for seeker_id, processed, unprocessed, metadata in test_seekers:
        print(f"Adding job seeker: {seeker_id}")
        chroma_matcher.add_job_seeker(seeker_id, processed, unprocessed, metadata)

    # Build search parameters from the job posting plus extra filters
    search_params = search_system.build_search_params(
        job_posting=job_posting,
        location={
            "state": "CA",
            "country": "US"
        },
        certifications={
            "name": "AWS Solutions Architect",
            "organization": "AWS"
        },
        tags=[
            {
                "tag_key": "seniority",
                "tag_value": "senior"
            }
        ]
    )

    print("\nConstructed Search Parameters:")
    print("=" * 60)
    print(json.dumps(search_params, indent=2))

    # Perform search
    print("\nPerforming search...")
    results = search_system.search_candidates(
        job_posting=job_posting,
        search_params=search_params,
        n_results=10
    )

    # Print results
    print("\nSearch Results:")
    print("=" * 60)

    if not results:
        print("No matches found")
        print("\nDebug: Checking OpenSearch response...")
        opensearch_results = opensearch_client.search_jobseekers(search_params)
        # The earlier probe treats the return value as possibly falsy, so
        # guard len() against a None result rather than crash while debugging.
        print(f"OpenSearch returned {len(opensearch_results or [])} results")
        return

    for i, result in enumerate(results, 1):
        print(f"\nMatch {i}:")
        print(f"Jobseeker ID: {result['jobseeker_id']}")
        print(f"ChromaDB Score: {result['similarity_score']:.3f}")
        print(f"OpenSearch Score: {result['opensearch_score']:.3f}")
        print("\nField Scores:")
        for field, score in result['field_scores'].items():
            print(f"  {field}: {score:.3f}")
        print(f"\nExplanation: {result['explanation']}")
139
# Script entry point: run the full two-phase pipeline check when executed directly.
if __name__ == "__main__":
    test_two_phase_search()