# legal_research / app.py
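"""Gradio app for multi-source Indian legal research.

Searches the India Code portal, LII of India, and Indian Kanoon in parallel,
ranks results with sentence-transformer embeddings (all-MiniLM-L6-v2),
optionally translates output with Helsinki-NLP/opus-mt-en-hi, and renders a
formatted report through a Gradio interface.
"""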
import gradio as gr
import requests
from transformers import MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from datetime import datetime
import warnings
import gc
import re
import time
import random
import torch
import concurrent.futures
warnings.filterwarnings('ignore')
class LegalResearchGenerator:
def __init__(self):
self.legal_categories = [
"criminal", "civil", "constitutional", "corporate",
"tax", "family", "property", "intellectual_property"
]
self.doc_types = {
"all": "",
"central_acts": "central-acts",
"state_acts": "state-acts",
"regulations": "regulations",
"ordinances": "ordinances",
"constitutional_orders": "constitutional-orders"
}
# Initialize translation model only when needed
self.translation_model = None
self.translation_tokenizer = None
self.session = requests.Session()
self.session.headers.update(self.get_random_headers())
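        # Retry configuration: up to max_retries attempts, waiting
        # retry_delay * attempt seconds when a source rate-limits (HTTP 429)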
self.max_retries = 3
self.retry_delay = 1
# Initialize sentence transformer model
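        # (all-MiniLM-L6-v2 produces 384-dimensional embeddings and runs fine
        # on CPU; if it fails to load, relevance scores fall back to 0.0)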
try:
self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
print(f"Error initializing sentence transformer: {e}")
self.sentence_model = None
def initialize_translation_model(self):
"""Initialize translation model only when needed"""
if self.translation_model is None:
try:
self.translation_model_name = "Helsinki-NLP/opus-mt-en-hi"
self.translation_model = MarianMTModel.from_pretrained(self.translation_model_name)
self.translation_tokenizer = MarianTokenizer.from_pretrained(self.translation_model_name)
except Exception as e:
print(f"Error initializing translation model: {e}")
return False
return True
    def get_random_headers(self):
        """Generate random browser headers to avoid detection"""
        try:
            ua = UserAgent()
            user_agent = ua[random.choice(['chrome', 'firefox', 'safari', 'edge'])]
        except Exception:
            # fake_useragent fetches its data at runtime and can fail offline;
            # fall back to a static, widely used user agent string
            user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/120.0.0.0 Safari/537.36')
        headers = {
            'User-Agent': user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'DNT': '1'
}
return headers
def calculate_relevance_score(self, query, text):
"""Calculate relevance score between query and text"""
if not self.sentence_model:
return 0.0
try:
query_embedding = self.sentence_model.encode([query])
text_embedding = self.sentence_model.encode([text])
similarity = float(torch.nn.functional.cosine_similarity(
torch.tensor(query_embedding),
torch.tensor(text_embedding)
))
return max(0.0, min(1.0, similarity)) # Ensure score is between 0 and 1
except Exception as e:
print(f"Error calculating relevance score: {e}")
return 0.0
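    # Illustrative relevance values (actual numbers depend on the
    # all-MiniLM-L6-v2 embeddings): a near-duplicate of the query scores
    # close to 1.0, a related statute title somewhere around 0.4-0.7, and
    # an unrelated document near 0.0.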
def clean_text(self, text):
"""Clean and format text content"""
if not text:
return ""
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text.strip())
# Remove special characters
text = re.sub(r'[^\w\s\.,;:?!-]', '', text)
return text
def format_legal_case(self, case_num, case_data, target_language='english'):
"""Format legal case data with improved layout"""
try:
title = self.translate_text(self.clean_text(case_data['title']), target_language)
summary = self.translate_text(self.clean_text(case_data['summary']), target_language)
source = case_data.get('source', 'Unknown Source')
relevance = round(case_data.get('relevance_score', 0) * 100, 2)
            output = f"""
{'═' * 80}
📑 LEGAL DOCUMENT {case_num}
{'═' * 80}
📌 TITLE:
{title}
📚 SOURCE: {source}
🎯 RELEVANCE: {relevance}%
📖 SUMMARY:
{summary}
🔗 DOCUMENT LINK:
{case_data['url']}
{'─' * 80}
"""
return output
except Exception as e:
print(f"Error formatting legal case: {e}")
return ""
    def translate_text(self, text, target_language):
        """Translate text to the target language (only English->Hindi is supported)"""
        if target_language.lower() != "hindi":
            # Only the Helsinki-NLP en->hi model is loaded, so any other target
            # language is returned untranslated rather than mistranslated
            return text
        if not self.initialize_translation_model():
            return text
try:
inputs = self.translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
translated = self.translation_model.generate(**inputs)
return self.translation_tokenizer.decode(translated[0], skip_special_tokens=True)
except Exception as e:
print(f"Error during translation: {e}")
return text
def fetch_from_indiacode(self, query, doc_type="all", max_results=5):
"""Fetch results from India Code portal"""
for attempt in range(self.max_retries):
try:
# Using a more reliable search endpoint
base_url = "https://www.indiacode.nic.in/search"
params = {
'q': query,
'type': self.doc_types.get(doc_type, ""),
'page': 1,
'size': max_results * 2
}
response = self.session.get(
base_url,
params=params,
headers=self.get_random_headers(),
timeout=15
)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
results = []
items = (
soup.select('div.artifact-description') or
soup.select('.search-result-item') or
soup.select('.result-item')
)
if not items:
print(f"No results found with current selectors. Attempt {attempt + 1}/{self.max_retries}")
continue
for item in items:
try:
title_elem = (
item.select_one('h4.artifact-title a') or
item.select_one('.act-title') or
item.select_one('h3 a')
)
title = title_elem.get_text(strip=True) if title_elem else "Untitled"
url = title_elem.get('href', '') if title_elem else ""
summary_elem = (
item.select_one('div.artifact-info') or
item.select_one('.act-description') or
item.select_one('.summary')
)
summary = summary_elem.get_text(strip=True) if summary_elem else ""
if not summary:
summary = ' '.join(text for text in item.stripped_strings
if text != title and len(text) > 30)
if url and not url.startswith('http'):
url = f"https://www.indiacode.nic.in{url}"
relevance_score = self.calculate_relevance_score(
query,
f"{title} {summary}"
)
results.append({
'title': title,
'court': 'India Code',
'summary': summary[:500],
'url': url,
'type': 'legal',
'source': 'India Code Portal',
'relevance_score': relevance_score
})
except Exception as e:
print(f"Error processing result: {e}")
continue
if results:
results.sort(key=lambda x: x['relevance_score'], reverse=True)
return results[:max_results]
elif response.status_code == 429:
wait_time = self.retry_delay * (attempt + 1)
time.sleep(wait_time)
continue
except Exception as e:
print(f"Error on attempt {attempt + 1}: {e}")
if attempt < self.max_retries - 1:
time.sleep(self.retry_delay)
continue
return []
def fetch_from_liiofindia(self, query, doc_type="all", max_results=5):
"""Fetch results from LII of India"""
try:
# Updated to use the main search endpoint
base_url = "https://www.liiofindia.org/search/"
params = {
'q': query,
'page': 1,
'per_page': max_results * 2,
'sort': 'relevance'
}
if doc_type != "all":
params['type'] = doc_type
response = self.session.get(
base_url,
params=params,
headers={
**self.get_random_headers(),
'Accept': 'application/json'
},
timeout=15
)
if response.status_code == 200:
try:
data = response.json()
results = []
for item in data.get('results', []):
title = item.get('title', 'Untitled')
summary = item.get('snippet', '')
relevance_score = self.calculate_relevance_score(
query,
f"{title} {summary}"
)
results.append({
'title': title,
'court': item.get('court', 'LII India'),
'summary': summary[:500],
'url': item.get('url', ''),
'type': 'legal',
'source': 'LII India',
'relevance_score': relevance_score
})
results.sort(key=lambda x: x['relevance_score'], reverse=True)
return results[:max_results]
except ValueError as e:
print(f"Error parsing JSON from LII India: {e}")
return []
return []
except Exception as e:
print(f"Error fetching from LII India: {e}")
return []
    def fetch_alternative_source(self, query, max_results=5):
        """Fetch results from alternative sources"""
        # 'formInput' is Indian Kanoon's search parameter; the other endpoints
        # may ignore it, in which case they simply contribute no results
        sources = [
            "https://indiankanoon.org/search/",
            "https://main.sci.gov.in/judgments",
            "https://doj.gov.in/acts-and-rules/"
        ]
        all_results = []
        for base_url in sources:
            try:
                params = {
                    'formInput': query,
                    'pageSize': max_results
                }
                response = self.session.get(
                    base_url,
                    params=params,
                    headers=self.get_random_headers(),
                    timeout=15
                )
                if response.status_code != 200:
                    continue
                soup = BeautifulSoup(response.text, 'html.parser')
                for result in soup.select('.result_item')[:max_results]:
                    try:
                        title_elem = result.select_one('.title a')
                        title = title_elem.get_text(strip=True) if title_elem else "Untitled"
                        url = title_elem.get('href', '') if title_elem else ""
                        snippet_elem = result.select_one('.snippet')
                        summary = snippet_elem.get_text(strip=True) if snippet_elem else ""
                        relevance_score = self.calculate_relevance_score(
                            query,
                            f"{title} {summary}"
                        )
                        all_results.append({
                            'title': title,
                            'court': 'Alternative Source',
                            'summary': summary[:500],
                            'url': url if url.startswith('http') else f"https://indiankanoon.org{url}",
                            'type': 'legal',
                            'source': 'Indian Kanoon',
                            'relevance_score': relevance_score
                        })
                    except Exception as e:
                        print(f"Error processing alternative result: {e}")
                        continue
            except Exception as e:
                print(f"Error fetching from {base_url}: {e}")
                continue
        # Accumulate across every source instead of returning after the first
        # successful one, then cap at max_results
        return all_results[:max_results]
def fetch_from_multiple_sources(self, query, doc_type="all", max_results=5):
"""Fetch and combine results from multiple sources"""
all_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
future_to_source = {
executor.submit(self.fetch_from_indiacode, query, doc_type, max_results): "India Code",
executor.submit(self.fetch_from_liiofindia, query, doc_type, max_results): "LII India",
executor.submit(self.fetch_alternative_source, query, max_results): "Alternative"
}
for future in concurrent.futures.as_completed(future_to_source):
source = future_to_source[future]
try:
results = future.result()
if results:
all_results.extend(results)
except Exception as e:
print(f"Error fetching from {source}: {e}")
# Sort by relevance score and return top results
all_results.sort(key=lambda x: x['relevance_score'], reverse=True)
return all_results[:max_results]
def process_research(self, input_query, research_type="legal", doc_type="all", target_language='english'):
"""Process research query and generate formatted output"""
try:
# Validate input
if not input_query.strip():
return "Error: Please enter a valid research query."
# Add default sample data for testing and development
sample_data = [
{
'title': 'Right to Privacy Judgment',
'court': 'Supreme Court',
'summary': 'The right to privacy is protected as an intrinsic part of the right to life and personal liberty under Article 21 and as a part of the freedoms guaranteed by Part III of the Constitution.',
'url': 'https://main.sci.gov.in/supremecourt/2012/35071/35071_2012_Judgement_24-Aug-2017.pdf',
'type': 'legal',
'source': 'Supreme Court of India',
'relevance_score': 0.95
},
{
'title': 'Information Technology Act, 2000',
'court': 'India Code',
'summary': 'An Act to provide legal recognition for transactions carried out by means of electronic data interchange and other means of electronic communication.',
'url': 'https://www.indiacode.nic.in/handle/123456789/1999/simple-search',
'type': 'legal',
'source': 'India Code Portal',
'relevance_score': 0.85
}
]
# Fetch results
cases = self.fetch_from_multiple_sources(input_query, doc_type)
# If no results found from APIs, use sample data for development
if not cases:
print("No results from APIs, using sample data")
cases = sample_data
# Generate header
            header = f"""
{'╔' + '═' * 78 + '╗'}
║ {'LEGAL DOCUMENT ANALYSIS REPORT'.center(76)} ║
{'╠' + '═' * 78 + '╣'}
║
║ 🎯 RESEARCH TOPIC: {self.translate_text(input_query, target_language)}
║ 📅 GENERATED: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
║ 📚 DOCUMENTS FOUND: {len(cases)}
║ 🔍 SOURCES SEARCHED: India Code Portal, LII India, Indian Kanoon
║
{'╚' + '═' * 78 + '╝'}
"""
            # Generate body; the research topic was already translated when the
            # header was built, so avoid re-translating the decorated header
            output_text = header
for i, case in enumerate(cases, 1):
output_text += self.format_legal_case(i, case, target_language)
# Generate footer
            footer = f"""
{'═' * 80}
📊 RESEARCH INSIGHTS
{'═' * 80}
• Results are sorted by relevance to your query
• All information should be verified from original sources
• Use provided links to access complete documents
{'─' * 80}
"""
output_text += self.translate_text(footer, target_language)
return output_text
except Exception as e:
return f"An error occurred during research processing: {str(e)}"
def clear_gpu_memory(self):
"""Clear GPU memory after processing"""
try:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
except Exception as e:
print(f"Error clearing GPU memory: {e}")
def create_gradio_interface():
"""Create Gradio interface with improved styling and error handling"""
generator = LegalResearchGenerator()
def process_input(input_text, research_type, doc_type, target_language, output_format):
if not input_text.strip():
return "Please enter a research topic to analyze."
try:
if output_format == "Text":
result = generator.process_research(
input_text,
research_type,
doc_type,
target_language
)
generator.clear_gpu_memory()
return result
else:
return "CSV output format is not implemented yet."
except Exception as e:
generator.clear_gpu_memory()
return f"An error occurred: {str(e)}"
css = """
.gradio-container {
font-family: 'Arial', sans-serif;
}
.output-text {
font-family: 'Courier New', monospace;
white-space: pre-wrap;
}
"""
iface = gr.Interface(
fn=process_input,
inputs=[
gr.Textbox(
label="Enter Research Topic",
placeholder="e.g., 'privacy rights' or 'environmental protection'",
lines=3
),
gr.Radio(
choices=["legal"],
label="Research Type",
value="legal"
),
gr.Dropdown(
choices=list(generator.doc_types.keys()),
label="Document Type",
value="all"
),
gr.Dropdown(
choices=["english", "hindi", "tamil", "bengali", "telugu"],
label="Output Language",
value="english"
),
gr.Radio(
choices=["Text", "CSV"],
label="Output Format",
value="Text"
)
],
outputs=gr.Textbox(
label="Research Analysis Report",
lines=30,
elem_classes=["output-text"]
),
        title="🔬 Legal Research Analysis Tool",
        description="""
        Advanced legal research tool for Indian legal document analysis.
        • Multi-source search across legal databases
        • Smart filtering and relevance ranking
        • Multi-language support
        • Comprehensive research reports
        """,
examples=[
["right to privacy", "legal", "central_acts", "english", "Text"],
["environmental protection", "legal", "regulations", "hindi", "Text"],
["digital rights", "legal", "constitutional_orders", "english", "Text"]
],
css=css
)
return iface
if __name__ == "__main__":
iface = create_gradio_interface()
iface.launch(share=True)
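
# Programmatic usage sketch (hypothetical, bypassing the Gradio UI):
#
#     gen = LegalResearchGenerator()
#     report = gen.process_research("right to privacy",
#                                   doc_type="central_acts",
#                                   target_language="english")
#     print(report)
#     gen.clear_gpu_memory()
#
# launch(share=True) above additionally exposes a temporary public Gradio URL.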