import gradio as gr
import requests
import pandas as pd  # retained for the (currently unimplemented) CSV output path
from transformers import MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from datetime import datetime
import warnings
import gc
import re
import time
import random
import torch
import concurrent.futures

warnings.filterwarnings('ignore')


class LegalResearchGenerator:
    def __init__(self):
        self.legal_categories = [
            "criminal", "civil", "constitutional", "corporate",
            "tax", "family", "property", "intellectual_property"
        ]
        self.doc_types = {
            "all": "",
            "central_acts": "central-acts",
            "state_acts": "state-acts",
            "regulations": "regulations",
            "ordinances": "ordinances",
            "constitutional_orders": "constitutional-orders"
        }
        # Translation model is loaded lazily, on the first non-English request
        self.translation_model = None
        self.translation_tokenizer = None
        self.session = requests.Session()
        self.session.headers.update(self.get_random_headers())
        self.max_retries = 3
        self.retry_delay = 1
        # Sentence transformer used for query/result relevance scoring
        try:
            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error initializing sentence transformer: {e}")
            self.sentence_model = None

    def initialize_translation_model(self):
        """Lazily initialize the translation model; return True on success."""
        if self.translation_model is None:
            try:
                self.translation_model_name = "Helsinki-NLP/opus-mt-en-hi"
                self.translation_model = MarianMTModel.from_pretrained(self.translation_model_name)
                self.translation_tokenizer = MarianTokenizer.from_pretrained(self.translation_model_name)
            except Exception as e:
                print(f"Error initializing translation model: {e}")
                return False
        return True

    def get_random_headers(self):
        """Generate random browser headers to reduce the chance of being blocked."""
        ua = UserAgent()
        browser = random.choice(['chrome', 'firefox', 'safari', 'edge'])
        return {
            'User-Agent': ua[browser],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'DNT': '1'
        }

    def calculate_relevance_score(self, query, text):
        """Calculate a cosine-similarity relevance score between query and text."""
        if not self.sentence_model:
            return 0.0
        try:
            query_embedding = self.sentence_model.encode([query])
            text_embedding = self.sentence_model.encode([text])
            similarity = float(torch.nn.functional.cosine_similarity(
                torch.tensor(query_embedding),
                torch.tensor(text_embedding)
            ))
            return max(0.0, min(1.0, similarity))  # Clamp score to [0, 1]
        except Exception as e:
            print(f"Error calculating relevance score: {e}")
            return 0.0

    def clean_text(self, text):
        """Collapse whitespace and strip unwanted special characters."""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text.strip())
        text = re.sub(r'[^\w\s\.,;:?!-]', '', text)
        return text

    def format_legal_case(self, case_num, case_data, target_language='english'):
        """Format a single legal document record for the text report."""
        try:
            title = self.translate_text(self.clean_text(case_data['title']), target_language)
            summary = self.translate_text(self.clean_text(case_data['summary']), target_language)
            source = case_data.get('source', 'Unknown Source')
            relevance = round(case_data.get('relevance_score', 0) * 100, 2)
            output = f"""
{'═' * 80}
📑 LEGAL DOCUMENT {case_num}
{'═' * 80}
📌 TITLE: {title}
📚 SOURCE: {source}
🎯 RELEVANCE: {relevance}%
📖 SUMMARY:
{summary}
🔗 DOCUMENT LINK: {case_data['url']}
{'─' * 80}
"""
            return output
        except Exception as e:
            print(f"Error formatting legal case: {e}")
            return ""

    def translate_text(self, text, target_language):
        """Translate text into the target language (currently only Hindi is supported)."""
        if target_language.lower() == "english":
            return text
        # Only an English->Hindi model is loaded; other languages fall back to English
        if target_language.lower() != "hindi":
            print(f"Translation to {target_language} is not supported yet; returning English text")
            return text
        if not self.initialize_translation_model():
            return text
        try:
            inputs = self.translation_tokenizer(
                text, return_tensors="pt", padding=True, truncation=True, max_length=512
            )
            translated = self.translation_model.generate(**inputs)
            return self.translation_tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Error during translation: {e}")
            return text

    def fetch_from_indiacode(self, query, doc_type="all", max_results=5):
        """Fetch results from the India Code portal."""
        for attempt in range(self.max_retries):
            try:
                base_url = "https://www.indiacode.nic.in/search"
                params = {
                    'q': query,
                    'type': self.doc_types.get(doc_type, ""),
                    'page': 1,
                    'size': max_results * 2
                }
                response = self.session.get(
                    base_url,
                    params=params,
                    headers=self.get_random_headers(),
                    timeout=15
                )
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    results = []
                    # Try several selectors, since the portal's markup varies
                    items = (
                        soup.select('div.artifact-description')
                        or soup.select('.search-result-item')
                        or soup.select('.result-item')
                    )
                    if not items:
                        print(f"No results found with current selectors. Attempt {attempt + 1}/{self.max_retries}")
                        continue
                    for item in items:
                        try:
                            title_elem = (
                                item.select_one('h4.artifact-title a')
                                or item.select_one('.act-title')
                                or item.select_one('h3 a')
                            )
                            title = title_elem.get_text(strip=True) if title_elem else "Untitled"
                            url = title_elem.get('href', '') if title_elem else ""
                            summary_elem = (
                                item.select_one('div.artifact-info')
                                or item.select_one('.act-description')
                                or item.select_one('.summary')
                            )
                            summary = summary_elem.get_text(strip=True) if summary_elem else ""
                            if not summary:
                                # Fall back to any reasonably long text node in the item
                                summary = ' '.join(
                                    text for text in item.stripped_strings
                                    if text != title and len(text) > 30
                                )
                            if url and not url.startswith('http'):
                                url = f"https://www.indiacode.nic.in{url}"
                            relevance_score = self.calculate_relevance_score(
                                query, f"{title} {summary}"
                            )
                            results.append({
                                'title': title,
                                'court': 'India Code',
                                'summary': summary[:500],
                                'url': url,
                                'type': 'legal',
                                'source': 'India Code Portal',
                                'relevance_score': relevance_score
                            })
                        except Exception as e:
                            print(f"Error processing result: {e}")
                            continue
                    if results:
                        results.sort(key=lambda x: x['relevance_score'], reverse=True)
                        return results[:max_results]
                elif response.status_code == 429:
                    # Rate limited: back off before retrying
                    time.sleep(self.retry_delay * (attempt + 1))
                    continue
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(self.retry_delay)
                    continue
        return []

    def fetch_from_liiofindia(self, query, doc_type="all", max_results=5):
        """Fetch results from LII of India."""
        try:
            base_url = "https://www.liiofindia.org/search/"
            params = {
                'q': query,
                'page': 1,
                'per_page': max_results * 2,
                'sort': 'relevance'
            }
            if doc_type != "all":
                params['type'] = doc_type
            response = self.session.get(
                base_url,
                params=params,
                headers={**self.get_random_headers(), 'Accept': 'application/json'},
                timeout=15
            )
            if response.status_code == 200:
                try:
                    data = response.json()
                    results = []
                    for item in data.get('results', []):
                        title = item.get('title', 'Untitled')
                        summary = item.get('snippet', '')
                        relevance_score = self.calculate_relevance_score(
                            query, f"{title} {summary}"
                        )
                        results.append({
                            'title': title,
                            'court': item.get('court', 'LII India'),
                            'summary': summary[:500],
                            'url': item.get('url', ''),
                            'type': 'legal',
                            'source': 'LII India',
                            'relevance_score': relevance_score
                        })
                    results.sort(key=lambda x: x['relevance_score'], reverse=True)
                    return results[:max_results]
                except ValueError as e:
                    print(f"Error parsing JSON from LII India: {e}")
                    return []
            return []
        except Exception as e:
            print(f"Error fetching from LII India: {e}")
            return []

    def fetch_alternative_source(self, query, max_results=5):
        """Fetch results from alternative sources."""
        all_results = []
        sources = [
            "https://indiankanoon.org/search/",
            "https://main.sci.gov.in/judgments",
            "https://doj.gov.in/acts-and-rules/"
        ]
        for base_url in sources:
            try:
                # 'formInput'/'pageSize' follow Indian Kanoon's search form;
                # the other sources simply ignore unknown parameters
                params = {
                    'formInput': query,
                    'pageSize': max_results
                }
                response = self.session.get(
                    base_url,
                    params=params,
                    headers=self.get_random_headers(),
                    timeout=15
                )
                if response.status_code != 200:
                    continue
                soup = BeautifulSoup(response.text, 'html.parser')
                for result in soup.select('.result_item')[:max_results]:
                    try:
                        title_elem = result.select_one('.title a')
                        title = title_elem.get_text(strip=True) if title_elem else "Untitled"
                        url = title_elem.get('href', '') if title_elem else ""
                        snippet_elem = result.select_one('.snippet')
                        summary = snippet_elem.get_text(strip=True) if snippet_elem else ""
                        relevance_score = self.calculate_relevance_score(
                            query, f"{title} {summary}"
                        )
                        all_results.append({
                            'title': title,
                            'court': 'Alternative Source',
                            'summary': summary[:500],
                            'url': url if url.startswith('http') else f"https://indiankanoon.org{url}",
                            'type': 'legal',
                            'source': 'Indian Kanoon',
                            'relevance_score': relevance_score
                        })
                    except Exception as e:
                        print(f"Error processing alternative result: {e}")
                        continue
            except Exception as e:
                print(f"Error in alternative source {base_url}: {e}")
                continue
        return all_results[:max_results]

    def fetch_from_multiple_sources(self, query, doc_type="all", max_results=5):
        """Fetch and combine results from multiple sources concurrently."""
        all_results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            future_to_source = {
                executor.submit(self.fetch_from_indiacode, query, doc_type, max_results): "India Code",
                executor.submit(self.fetch_from_liiofindia, query, doc_type, max_results): "LII India",
                executor.submit(self.fetch_alternative_source, query, max_results): "Alternative"
            }
            for future in concurrent.futures.as_completed(future_to_source):
                source = future_to_source[future]
                try:
                    results = future.result()
                    if results:
                        all_results.extend(results)
                except Exception as e:
                    print(f"Error fetching from {source}: {e}")
        # Sort by relevance score and return the top results
        all_results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return all_results[:max_results]
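    # Every fetcher above returns records with the same flat schema, which is
    # what makes the concurrent merge-and-sort possible. For reference:
    #
    #   {
    #       'title': str,             # document title
    #       'court': str,             # issuing court or portal label
    #       'summary': str,           # truncated to 500 characters
    #       'url': str,               # absolute link to the document
    #       'type': 'legal',
    #       'source': str,            # human-readable source name
    #       'relevance_score': float  # cosine similarity, clamped to [0, 1]
    #   }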
    def process_research(self, input_query, research_type="legal", doc_type="all", target_language='english'):
        """Process a research query and generate a formatted report."""
        try:
            # Validate input
            if not input_query.strip():
                return "Error: Please enter a valid research query."

            # Default sample data used as a fallback during testing and development
            sample_data = [
                {
                    'title': 'Right to Privacy Judgment',
                    'court': 'Supreme Court',
                    'summary': 'The right to privacy is protected as an intrinsic part of the right to life and personal liberty under Article 21 and as a part of the freedoms guaranteed by Part III of the Constitution.',
                    'url': 'https://main.sci.gov.in/supremecourt/2012/35071/35071_2012_Judgement_24-Aug-2017.pdf',
                    'type': 'legal',
                    'source': 'Supreme Court of India',
                    'relevance_score': 0.95
                },
                {
                    'title': 'Information Technology Act, 2000',
                    'court': 'India Code',
                    'summary': 'An Act to provide legal recognition for transactions carried out by means of electronic data interchange and other means of electronic communication.',
                    'url': 'https://www.indiacode.nic.in/handle/123456789/1999/simple-search',
                    'type': 'legal',
                    'source': 'India Code Portal',
                    'relevance_score': 0.85
                }
            ]

            # Fetch results
            cases = self.fetch_from_multiple_sources(input_query, doc_type)

            # If no results came back from the live sources, fall back to sample data
            if not cases:
                print("No results from APIs, using sample data")
                cases = sample_data

            # Generate header
            header = f"""
{'╔' + '═' * 78 + '╗'}
║ {'LEGAL DOCUMENT ANALYSIS REPORT'.center(76)} ║
{'╠' + '═' * 78 + '╣'}
║ 🎯 RESEARCH TOPIC: {self.translate_text(input_query, target_language)}
║ 📅 GENERATED: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
║ 📚 DOCUMENTS FOUND: {len(cases)}
║ 🔍 SOURCES SEARCHED: India Code Portal, LII India, Indian Kanoon
{'╚' + '═' * 78 + '╝'}
"""
            # Generate body
            output_text = self.translate_text(header, target_language)
            for i, case in enumerate(cases, 1):
                output_text += self.format_legal_case(i, case, target_language)

            # Generate footer
            footer = f"""
{'═' * 80}
📊 RESEARCH INSIGHTS
{'═' * 80}
• Results are sorted by relevance to your query
• All information should be verified from original sources
• Use provided links to access complete documents
{'─' * 80}
"""
            output_text += self.translate_text(footer, target_language)
            return output_text
        except Exception as e:
            return f"An error occurred during research processing: {str(e)}"

    def clear_gpu_memory(self):
        """Free Python and GPU memory after processing."""
        try:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"Error clearing GPU memory: {e}")


def create_gradio_interface():
    """Create the Gradio interface with styling and error handling."""
    generator = LegalResearchGenerator()

    def process_input(input_text, research_type, doc_type, target_language, output_format):
        if not input_text.strip():
            return "Please enter a research topic to analyze."
        try:
            if output_format == "Text":
                result = generator.process_research(
                    input_text, research_type, doc_type, target_language
                )
                generator.clear_gpu_memory()
                return result
            return "CSV output format is not implemented yet."
        except Exception as e:
            generator.clear_gpu_memory()
            return f"An error occurred: {str(e)}"
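    # A possible sketch for the unimplemented CSV branch, not wired in: the
    # result dicts are flat, so pandas (imported above) could serialise them
    # directly, e.g.
    #
    #   cases = generator.fetch_from_multiple_sources(input_text, doc_type)
    #   return pd.DataFrame(cases).to_csv(index=False)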
    css = """
    .gradio-container {
        font-family: 'Arial', sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        white-space: pre-wrap;
    }
    """

    iface = gr.Interface(
        fn=process_input,
        inputs=[
            gr.Textbox(
                label="Enter Research Topic",
                placeholder="e.g., 'privacy rights' or 'environmental protection'",
                lines=3
            ),
            gr.Radio(
                choices=["legal"],
                label="Research Type",
                value="legal"
            ),
            gr.Dropdown(
                choices=list(generator.doc_types.keys()),
                label="Document Type",
                value="all"
            ),
            # Note: translate_text currently only supports Hindi; the other
            # listed languages fall back to English output
            gr.Dropdown(
                choices=["english", "hindi", "tamil", "bengali", "telugu"],
                label="Output Language",
                value="english"
            ),
            gr.Radio(
                choices=["Text", "CSV"],
                label="Output Format",
                value="Text"
            )
        ],
        outputs=gr.Textbox(
            label="Research Analysis Report",
            lines=30,
            elem_classes=["output-text"]
        ),
        title="🔬 Legal Research Analysis Tool",
        description="""
        Advanced legal research tool for Indian legal document analysis.
        • Multi-source search across legal databases
        • Smart filtering and relevance ranking
        • Multi-language support
        • Comprehensive research reports
        """,
        examples=[
            ["right to privacy", "legal", "central_acts", "english", "Text"],
            ["environmental protection", "legal", "regulations", "hindi", "Text"],
            ["digital rights", "legal", "constitutional_orders", "english", "Text"]
        ],
        css=css
    )
    return iface


if __name__ == "__main__":
    iface = create_gradio_interface()
    iface.launch(share=True)
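# Minimal programmatic usage sketch, bypassing the Gradio UI (assumes the
# default arguments defined above):
#
#   generator = LegalResearchGenerator()
#   report = generator.process_research("right to privacy", doc_type="central_acts")
#   print(report)
#   generator.clear_gpu_memory()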