# Hugging Face Space: Anupam251272/legal_research (status: Running)
# File size: 22,292 bytes
# Revision: 0b85946

import gradio as gr
import requests
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from datetime import datetime
import warnings
import gc
import re
import time
import random
import torch
from requests.exceptions import RequestException
import concurrent.futures
import json

# Suppress every Python warning for the whole process (applies to all
# libraries imported above, not just this module).
warnings.filterwarnings('ignore')

class LegalResearchGenerator:
    def __init__(self):
        """Prepare search configuration, the HTTP session and the embedding model.

        The translation model is deliberately left unloaded here; it is
        created on demand by ``initialize_translation_model``. If the sentence
        embedding model cannot be constructed, ``sentence_model`` is set to
        ``None`` and an error is printed instead of raising.
        """
        # Known legal topic areas.
        self.legal_categories = [
            "criminal",
            "civil",
            "constitutional",
            "corporate",
            "tax",
            "family",
            "property",
            "intellectual_property",
        ]

        # UI document-type key -> value sent as the ``type`` query parameter.
        self.doc_types = dict(
            all="",
            central_acts="central-acts",
            state_acts="state-acts",
            regulations="regulations",
            ordinances="ordinances",
            constitutional_orders="constitutional-orders",
        )

        # Translation objects are filled in lazily.
        self.translation_model = self.translation_tokenizer = None

        # One shared HTTP session, seeded with a randomised header set.
        http_session = requests.Session()
        self.session = http_session
        http_session.headers.update(self.get_random_headers())

        # Retry policy used by the fetchers.
        self.max_retries, self.retry_delay = 3, 1

        # Sentence embedding model used for relevance scoring.
        embedder = None
        try:
            embedder = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Error initializing sentence transformer: {e}")
        self.sentence_model = embedder

    def initialize_translation_model(self):
        """Initialize translation model only when needed"""
        if self.translation_model is None:
            try:
                self.translation_model_name = "Helsinki-NLP/opus-mt-en-hi"
                self.translation_model = MarianMTModel.from_pretrained(self.translation_model_name)
                self.translation_tokenizer = MarianTokenizer.from_pretrained(self.translation_model_name)
            except Exception as e:
                print(f"Error initializing translation model: {e}")
                return False
        return True

    def get_random_headers(self):
        """Generate random browser headers to avoid detection"""
        ua = UserAgent()
        browser_list = ['chrome', 'firefox', 'safari', 'edge']
        browser = random.choice(browser_list)
        
        headers = {
            'User-Agent': ua[browser],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'DNT': '1'
        }
        return headers

    def calculate_relevance_score(self, query, text):
        """Calculate relevance score between query and text"""
        if not self.sentence_model:
            return 0.0
            
        try:
            query_embedding = self.sentence_model.encode([query])
            text_embedding = self.sentence_model.encode([text])
            
            similarity = float(torch.nn.functional.cosine_similarity(
                torch.tensor(query_embedding),
                torch.tensor(text_embedding)
            ))
            return max(0.0, min(1.0, similarity))  # Ensure score is between 0 and 1
            
        except Exception as e:
            print(f"Error calculating relevance score: {e}")
            return 0.0

    def clean_text(self, text):
        """Normalise whitespace and strip unusual characters from ``text``.

        Runs of whitespace collapse to a single space, then every character
        that is not a word character, whitespace, or one of ``. , ; : ? ! -``
        is removed. Falsy input (``None``, ``""``) yields ``""``.
        """
        if text:
            collapsed = re.sub(r'\s+', ' ', text.strip())
            return re.sub(r'[^\w\s\.,;:?!-]', '', collapsed)
        return ""

    def format_legal_case(self, case_num, case_data, target_language='english'):
        """Format legal case data with improved layout"""
        try:
            title = self.translate_text(self.clean_text(case_data['title']), target_language)
            summary = self.translate_text(self.clean_text(case_data['summary']), target_language)
            source = case_data.get('source', 'Unknown Source')
            relevance = round(case_data.get('relevance_score', 0) * 100, 2)
            
            output = f"""
{'═' * 80}
📑 LEGAL DOCUMENT {case_num}
{'═' * 80}

📌 TITLE: 
{title}

📚 SOURCE: {source}
🎯 RELEVANCE: {relevance}%

📖 SUMMARY:
{summary}

🔗 DOCUMENT LINK:
{case_data['url']}

{'─' * 80}
"""
            return output
        except Exception as e:
            print(f"Error formatting legal case: {e}")
            return ""

    def translate_text(self, text, target_language):
        """Translate text to target language"""
        if target_language.lower() == "english":
            return text
            
        if not self.initialize_translation_model():
            return text
            
        try:
            inputs = self.translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            translated = self.translation_model.generate(**inputs)
            return self.translation_tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Error during translation: {e}")
            return text

    def fetch_from_indiacode(self, query, doc_type="all", max_results=5):
        """Fetch results from India Code portal"""
        for attempt in range(self.max_retries):
            try:
                # Using a more reliable search endpoint
                base_url = "https://www.indiacode.nic.in/search"
                
                params = {
                    'q': query,
                    'type': self.doc_types.get(doc_type, ""),
                    'page': 1,
                    'size': max_results * 2
                }
                
                response = self.session.get(
                    base_url,
                    params=params,
                    headers=self.get_random_headers(),
                    timeout=15
                )
                
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    results = []
                    
                    items = (
                        soup.select('div.artifact-description') or
                        soup.select('.search-result-item') or
                        soup.select('.result-item')
                    )
                    
                    if not items:
                        print(f"No results found with current selectors. Attempt {attempt + 1}/{self.max_retries}")
                        continue
                    
                    for item in items:
                        try:
                            title_elem = (
                                item.select_one('h4.artifact-title a') or
                                item.select_one('.act-title') or
                                item.select_one('h3 a')
                            )
                            
                            title = title_elem.get_text(strip=True) if title_elem else "Untitled"
                            url = title_elem.get('href', '') if title_elem else ""
                            
                            summary_elem = (
                                item.select_one('div.artifact-info') or
                                item.select_one('.act-description') or
                                item.select_one('.summary')
                            )
                            summary = summary_elem.get_text(strip=True) if summary_elem else ""
                            
                            if not summary:
                                summary = ' '.join(text for text in item.stripped_strings
                                                if text != title and len(text) > 30)
                            
                            if url and not url.startswith('http'):
                                url = f"https://www.indiacode.nic.in{url}"
                            
                            relevance_score = self.calculate_relevance_score(
                                query,
                                f"{title} {summary}"
                            )
                            
                            results.append({
                                'title': title,
                                'court': 'India Code',
                                'summary': summary[:500],
                                'url': url,
                                'type': 'legal',
                                'source': 'India Code Portal',
                                'relevance_score': relevance_score
                            })
                            
                        except Exception as e:
                            print(f"Error processing result: {e}")
                            continue
                    
                    if results:
                        results.sort(key=lambda x: x['relevance_score'], reverse=True)
                        return results[:max_results]
                
                elif response.status_code == 429:
                    wait_time = self.retry_delay * (attempt + 1)
                    time.sleep(wait_time)
                    continue
                
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(self.retry_delay)
                continue
        
        return []

    def fetch_from_liiofindia(self, query, doc_type="all", max_results=5):
        """Fetch results from LII of India"""
        try:
            # Updated to use the main search endpoint
            base_url = "https://www.liiofindia.org/search/"
            
            params = {
                'q': query,
                'page': 1,
                'per_page': max_results * 2,
                'sort': 'relevance'
            }
            
            if doc_type != "all":
                params['type'] = doc_type
            
            response = self.session.get(
                base_url,
                params=params,
                headers={
                    **self.get_random_headers(),
                    'Accept': 'application/json'
                },
                timeout=15
            )
            
            if response.status_code == 200:
                try:
                    data = response.json()
                    results = []
                    
                    for item in data.get('results', []):
                        title = item.get('title', 'Untitled')
                        summary = item.get('snippet', '')
                        
                        relevance_score = self.calculate_relevance_score(
                            query,
                            f"{title} {summary}"
                        )
                        
                        results.append({
                            'title': title,
                            'court': item.get('court', 'LII India'),
                            'summary': summary[:500],
                            'url': item.get('url', ''),
                            'type': 'legal',
                            'source': 'LII India',
                            'relevance_score': relevance_score
                        })
                    
                    results.sort(key=lambda x: x['relevance_score'], reverse=True)
                    return results[:max_results]
                    
                except ValueError as e:
                    print(f"Error parsing JSON from LII India: {e}")
                    return []
            
            return []
            
        except Exception as e:
            print(f"Error fetching from LII India: {e}")
            return []

    def fetch_alternative_source(self, query, max_results=5):
        """Fetch results from alternative sources"""
        try:
            # Try multiple alternative sources
            sources = [
                "https://indiankanoon.org/search/",
                "https://main.sci.gov.in/judgments",
                "https://doj.gov.in/acts-and-rules/"
            ]
            
            all_results = []
            for base_url in sources: # Added colon here
            
                params = {
                    'formInput': query,
                    'pageSize': max_results
                }
                
                response = self.session.get(
                    base_url,
                    params=params,
                    headers=self.get_random_headers(),
                    timeout=15
                )
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                results = []
                
                for result in soup.select('.result_item')[:max_results]:
                    try:
                        title_elem = result.select_one('.title a')
                        title = title_elem.get_text(strip=True) if title_elem else "Untitled"
                        url = title_elem.get('href', '') if title_elem else ""
                        
                        snippet_elem = result.select_one('.snippet')
                        summary = snippet_elem.get_text(strip=True) if snippet_elem else ""
                        
                        relevance_score = self.calculate_relevance_score(
                            query,
                            f"{title} {summary}"
                        )
                        
                        results.append({
                            'title': title,
                            'court': 'Alternative Source',
                            'summary': summary[:500],
                            'url': url if url.startswith('http') else f"https://indiankanoon.org{url}",
                            'type': 'legal',
                            'source': 'Indian Kanoon',
                            'relevance_score': relevance_score
                        })
                        
                    except Exception as e:
                        print(f"Error processing alternative result: {e}")
                        continue
                
                return results
            
        except Exception as e:
            print(f"Error in alternative source: {e}")
        
        return []

    def fetch_from_multiple_sources(self, query, doc_type="all", max_results=5):
        """Fetch and combine results from multiple sources"""
        all_results = []
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            future_to_source = {
                executor.submit(self.fetch_from_indiacode, query, doc_type, max_results): "India Code",
                executor.submit(self.fetch_from_liiofindia, query, doc_type, max_results): "LII India",
                executor.submit(self.fetch_alternative_source, query, max_results): "Alternative"
            }
            
            for future in concurrent.futures.as_completed(future_to_source):
                source = future_to_source[future]
                try:
                    results = future.result()
                    if results:
                        all_results.extend(results)
                except Exception as e:
                    print(f"Error fetching from {source}: {e}")
        
        # Sort by relevance score and return top results
        all_results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return all_results[:max_results]

    def process_research(self, input_query, research_type="legal", doc_type="all", target_language='english'):
        """Process research query and generate formatted output"""
        try:
            # Validate input
            if not input_query.strip():
                return "Error: Please enter a valid research query."
                
            # Add default sample data for testing and development
            sample_data = [
                {
                    'title': 'Right to Privacy Judgment',
                    'court': 'Supreme Court',
                    'summary': 'The right to privacy is protected as an intrinsic part of the right to life and personal liberty under Article 21 and as a part of the freedoms guaranteed by Part III of the Constitution.',
                    'url': 'https://main.sci.gov.in/supremecourt/2012/35071/35071_2012_Judgement_24-Aug-2017.pdf',
                    'type': 'legal',
                    'source': 'Supreme Court of India',
                    'relevance_score': 0.95
                },
                {
                    'title': 'Information Technology Act, 2000',
                    'court': 'India Code',
                    'summary': 'An Act to provide legal recognition for transactions carried out by means of electronic data interchange and other means of electronic communication.',
                    'url': 'https://www.indiacode.nic.in/handle/123456789/1999/simple-search',
                    'type': 'legal',
                    'source': 'India Code Portal',
                    'relevance_score': 0.85
                }
            ]
                
            # Fetch results
            cases = self.fetch_from_multiple_sources(input_query, doc_type)
            
            # If no results found from APIs, use sample data for development
            if not cases:
                print("No results from APIs, using sample data")
                cases = sample_data
                
            # Generate header
            header = f"""
{'╔' + '═' * 78 + '╗'}
║ {'LEGAL DOCUMENT ANALYSIS REPORT'.center(76)} ║
{'╠' + '═' * 78 + '╣'}
║
║ 🎯 RESEARCH TOPIC: {self.translate_text(input_query, target_language)}
║ 📅 GENERATED: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
║ 📚 DOCUMENTS FOUND: {len(cases)}
║ 🔍 SOURCES SEARCHED: India Code Portal, LII India, Indian Kanoon
║
{'╚' + '═' * 78 + '╝'}
"""
            
            # Generate body
            output_text = self.translate_text(header, target_language)
            for i, case in enumerate(cases, 1):
                output_text += self.format_legal_case(i, case, target_language)
                
            # Generate footer
            footer = f"""
{'═' * 80}
📊 RESEARCH INSIGHTS
{'═' * 80}

• Results are sorted by relevance to your query
• All information should be verified from original sources
• Use provided links to access complete documents

{'─' * 80}
"""
            output_text += self.translate_text(footer, target_language)
            return output_text
            
        except Exception as e:
            return f"An error occurred during research processing: {str(e)}"

    def clear_gpu_memory(self):
        """Clear GPU memory after processing"""
        try:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"Error clearing GPU memory: {e}")

def create_gradio_interface():
    """Build the Gradio UI around a single ``LegalResearchGenerator``.

    One generator instance is created up front and shared by every request.
    The returned interface is not launched here.
    """
    generator = LegalResearchGenerator()

    def process_input(input_text, research_type, doc_type, target_language, output_format):
        """Gradio callback: validate the form, run the research, return text."""
        if not input_text.strip():
            return "Please enter a research topic to analyze."

        try:
            # Only the plain-text report exists so far.
            if output_format != "Text":
                return "CSV output format is not implemented yet."

            report = generator.process_research(
                input_text, research_type, doc_type, target_language
            )
            generator.clear_gpu_memory()
            return report
        except Exception as e:
            generator.clear_gpu_memory()
            return f"An error occurred: {str(e)}"

    css = """
    .gradio-container {
        font-family: 'Arial', sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        white-space: pre-wrap;
    }
    """

    # Form controls, created in the order the callback receives them.
    topic_box = gr.Textbox(
        label="Enter Research Topic",
        placeholder="e.g., 'privacy rights' or 'environmental protection'",
        lines=3,
    )
    research_type_radio = gr.Radio(
        choices=["legal"],
        label="Research Type",
        value="legal",
    )
    doc_type_dropdown = gr.Dropdown(
        choices=list(generator.doc_types.keys()),
        label="Document Type",
        value="all",
    )
    language_dropdown = gr.Dropdown(
        choices=["english", "hindi", "tamil", "bengali", "telugu"],
        label="Output Language",
        value="english",
    )
    format_radio = gr.Radio(
        choices=["Text", "CSV"],
        label="Output Format",
        value="Text",
    )

    # Monospace output box so the framed report keeps its alignment.
    report_box = gr.Textbox(
        label="Research Analysis Report",
        lines=30,
        elem_classes=["output-text"],
    )

    return gr.Interface(
        fn=process_input,
        inputs=[
            topic_box,
            research_type_radio,
            doc_type_dropdown,
            language_dropdown,
            format_radio,
        ],
        outputs=report_box,
        title="🔬 Legal Research Analysis Tool",
        description="""
        Advanced legal research tool for Indian legal document analysis.
        • Multi-source search across legal databases
        • Smart filtering and relevance ranking
        • Multi-language support
        • Comprehensive research reports
        """,
        examples=[
            ["right to privacy", "legal", "central_acts", "english", "Text"],
            ["environmental protection", "legal", "regulations", "hindi", "Text"],
            ["digital rights", "legal", "constitutional_orders", "english", "Text"]
        ],
        css=css
    )

# Script entry point: build the interface and start serving it.
if __name__ == "__main__":
    iface = create_gradio_interface()
    # share=True asks Gradio to also expose a public share link.
    iface.launch(share=True)