Spaces:

cstr
/

PDF-Summarizer

Running

App Files Files Community

cstr committed on Dec 6, 2024

Commit

0ae08d5

verified ·

1 Parent(s): d3c2351

Create functions.py

Browse files

Files changed (1) hide show

functions.py +297 -0

functions.py ADDED Viewed

	@@ -0,0 +1,297 @@

import html
import logging
import re
from pathlib import Path
from typing import List, Dict, Union, Optional

import openai
import requests
from gradio_client import Client
from PyPDF2 import PdfReader
# Module-wide logging setup: INFO level with a timestamped, named record layout.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
# Logger shared by every function in this module.
logger = logging.getLogger(__name__)
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract text from a PDF file with robust error handling.

    Pages whose extraction raises, or that yield no text, are logged and
    skipped. Every failure -- including a missing file -- is logged and
    surfaced to the caller as a RuntimeError whose ``__cause__`` is the
    underlying exception.

    Args:
        file_path: Path to the PDF file
    Returns:
        Text of all readable pages, separated by blank lines
    Raises:
        RuntimeError: If the file doesn't exist, can't be parsed, or contains
            no readable text
    """
    try:
        if not Path(file_path).exists():
            raise ValueError(f"PDF file not found: {file_path}")
        reader = PdfReader(file_path)
        text_content = []
        for page_num, page in enumerate(reader.pages, 1):
            try:
                # Treat a missing (None) extraction result like an empty page
                text = page.extract_text() or ""
            except Exception as e:
                # One bad page must not abort the whole document
                logger.error(f"Error extracting text from page {page_num}: {str(e)}")
                continue
            if text.strip():
                text_content.append(text)
            else:
                logger.warning(f"Page {page_num} appears to be empty or unreadable")
        if not text_content:
            raise RuntimeError("No readable text found in PDF")
        return "\n\n".join(text_content)
    except Exception as e:
        logger.error(f"PDF extraction failed: {str(e)}")
        # Chain the original exception so callers can inspect the real cause
        raise RuntimeError(f"Failed to process PDF: {str(e)}") from e
def format_content(text: str, format_type: str) -> str:
    """
    Format extracted text into the specified output format.

    Whitespace runs are collapsed to single spaces and a paragraph break is
    inserted after every '.', '!' or '?' that is followed by whitespace.
    A paragraph that starts with an uppercase ASCII letter and contains no
    '.', '!' or '?' is rendered as a level-2 heading in 'md' and 'html'
    output. For 'html', paragraph text is escaped so that characters such as
    '<' and '&' coming from the PDF cannot break or inject markup.

    Args:
        text: Raw text content
        format_type: Output format ('txt', 'md', 'html'), case-insensitive
    Returns:
        Formatted text string
    Raises:
        ValueError: If text is not a string or format type is invalid
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string")
    # Reject bad formats before doing any regex work
    if not isinstance(format_type, str):
        raise ValueError(f"Unsupported format type: {format_type}")
    fmt = format_type.lower()
    if fmt not in ('txt', 'md', 'html'):
        raise ValueError(f"Unsupported format type: {format_type}")

    # Clean up common PDF extraction artifacts
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'(?<=[.!?])\s+', '\n\n', text)  # Split sentences into paragraphs
    text = text.strip()
    if fmt == 'txt':
        return text

    header_pattern = re.compile(r'^[A-Z][^.!?]*$')
    stripped = (para.strip() for para in text.split('\n\n'))
    # Empty input would otherwise produce an empty <p></p> element
    paragraphs = [para for para in stripped if para]

    if fmt == 'md':
        return '\n\n'.join(
            f"## {para}" if header_pattern.match(para) else para
            for para in paragraphs
        )

    html_parts = ['<!DOCTYPE html>', '<html>', '<body>']
    for para in paragraphs:
        tag = 'h2' if header_pattern.match(para) else 'p'
        # Escape only after header detection so the pattern sees the raw text
        html_parts.append(f"<{tag}>{html.escape(para, quote=False)}</{tag}>")
    html_parts.extend(['</body>', '</html>'])
    return '\n'.join(html_parts)
def _hard_wrap(paragraph: str, limit: int) -> List[str]:
    """Cut a paragraph into pieces of at most `limit` characters, preferring whitespace breaks."""
    if len(paragraph) <= limit:
        return [paragraph]
    pieces = []
    rest = paragraph
    while len(rest) > limit:
        # Last space/newline that keeps the piece non-empty and within the limit
        cut = max(rest.rfind(' ', 1, limit + 1), rest.rfind('\n', 1, limit + 1))
        if cut == -1:
            # No whitespace in the window: fall back to a hard cut
            pieces.append(rest[:limit])
            rest = rest[limit:]
        else:
            pieces.append(rest[:cut])
            rest = rest[cut + 1:]  # drop the whitespace we broke on
    if rest:
        pieces.append(rest)
    return pieces


def split_into_snippets(text: str, chunk_size: int = 4000, overlap: int = 200) -> List[str]:
    """
    Split text into overlapping chunks that fit within model context windows.

    The text is split on blank lines and paragraphs are packed greedily.
    Every returned chunk is at most `chunk_size` characters long: paragraphs
    that are too long to fit next to the overlap prefix are cut into smaller
    pieces, preferably at whitespace. Each chunk after the first starts with
    the last `overlap` characters of the previous chunk.

    Args:
        text: Input text to split
        chunk_size: Maximum size of each chunk
        overlap: Number of characters to overlap between chunks; values
            below 1 disable the overlap
    Returns:
        List of text snippets
    Raises:
        ValueError: If chunk_size is too small, text is empty, or overlap
            leaves no room for new text in a chunk
    """
    if not text:
        raise ValueError("Input text is empty")
    if chunk_size < 1000:
        raise ValueError("Chunk size must be at least 1000 characters")
    separator = '\n\n'
    overlap = max(overlap, 0)
    # Longest piece that still fits after a full overlap prefix and a separator
    piece_limit = chunk_size - overlap - len(separator)
    if piece_limit < 1:
        raise ValueError("Overlap is too large for the given chunk size")

    chunks: List[str] = []
    parts: List[str] = []
    size = 0  # exact length of separator.join(parts)
    for para in text.split(separator):
        for piece in _hard_wrap(para, piece_limit):
            added = len(piece) + (len(separator) if parts else 0)
            if parts and size + added > chunk_size:
                finished = separator.join(parts)
                chunks.append(finished)
                # Seed the next chunk with the tail of the one just closed;
                # skip it entirely when overlap is disabled so the chunk
                # does not start with a stray separator
                tail = finished[-overlap:] if overlap else ""
                parts = [tail] if tail else []
                size = len(tail)
                added = len(piece) + (len(separator) if parts else 0)
            parts.append(piece)
            size += added
    # Add the last chunk if it exists
    if parts:
        chunks.append(separator.join(parts))
    return chunks
def build_prompts(chunks: List[str], custom_prompt: Optional[str] = None) -> List[str]:
    """
    Wrap every text chunk in an instruction template.
    Args:
        chunks: Text chunks, in document order
        custom_prompt: Instruction to use instead of the built-in one; None
            or an empty string selects the built-in instruction
    Returns:
        One prompt per chunk, in the same order, each labelled with its
        1-based position and the total number of chunks
    """
    default_prompt = """Please analyze and summarize the following text. Focus on:
1. Key points and main ideas
2. Important details and supporting evidence
3. Any conclusions or recommendations
Please maintain the original meaning while being concise."""
    instruction = custom_prompt or default_prompt
    total = len(chunks)

    def render(position: int, body: str) -> str:
        # Assemble the prompt for a single chunk
        return f"""### Instruction
{instruction}
### Input Text (Part {position} of {total})
{body}
### End of Input Text
Please provide your summary below:"""

    return [render(position, body) for position, body in enumerate(chunks, 1)]
def process_with_model(
    prompt: str,
    model_choice: str,
    api_key: Optional[str] = None,
    oauth_token: Optional[str] = None,
    timeout: float = 120.0
) -> str:
    """
    Process text with selected model.

    Model names containing 'gpt' (case-insensitive) are sent to OpenAI; all
    other names are looked up in a fixed table and sent to the Hugging Face
    Inference API. Every failure -- including missing credentials and unknown
    model names -- is logged and surfaced as a RuntimeError whose
    ``__cause__`` is the underlying exception.

    Args:
        prompt: Input prompt
        model_choice: Selected model name
        api_key: OpenAI API key for GPT models
        oauth_token: Hugging Face token for other models
        timeout: Seconds to wait for the remote API before giving up
    Returns:
        Generated summary
    Raises:
        RuntimeError: If credentials are missing, the model is unknown, or
            model processing fails
    """
    try:
        if 'gpt' in model_choice.lower():
            if not api_key:
                raise ValueError("OpenAI API key required for GPT models")
            request = dict(
                model="gpt-3.5-turbo" if "3.5" in model_choice else "gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=1500
            )
            if hasattr(openai, "OpenAI"):
                # openai>=1.0 removed the module-level ChatCompletion API;
                # a per-call client also avoids mutating global state
                client = openai.OpenAI(api_key=api_key, timeout=timeout)
                response = client.chat.completions.create(**request)
            else:
                # Legacy (<1.0) SDK
                openai.api_key = api_key
                response = openai.ChatCompletion.create(request_timeout=timeout, **request)
            return response.choices[0].message.content
        else:  # Hugging Face models
            if not oauth_token:
                raise ValueError("Hugging Face token required")
            headers = {"Authorization": f"Bearer {oauth_token}"}
            # Map model choice to actual model ID
            # NOTE(review): confirm each ID is actually served by the
            # Hugging Face Inference API
            model_map = {
                "Claude-3": "anthropic/claude-3-opus-20240229",
                "Mistral": "mistralai/Mixtral-8x7B-Instruct-v0.1"
            }
            model_id = model_map.get(model_choice)
            if not model_id:
                raise ValueError(f"Unknown model: {model_choice}")
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{model_id}",
                headers=headers,
                json={"inputs": prompt},
                timeout=timeout  # without this a stalled connection hangs forever
            )
            if response.status_code != 200:
                raise RuntimeError(f"Model API error: {response.text}")
            return response.json()[0]["generated_text"]
    except Exception as e:
        logger.error(f"Model processing failed: {str(e)}")
        # Chain the original exception so callers can inspect the real cause
        raise RuntimeError(f"Failed to process with model: {str(e)}") from e
def validate_api_keys(
    openai_key: Optional[str] = None,
    hf_token: Optional[str] = None,
    timeout: float = 10.0
) -> Dict[str, bool]:
    """
    Validate API keys for different services.

    Each credential that is supplied is checked with one live request. The
    check is best-effort: any error during a check is logged and reported as
    an invalid credential rather than raised.

    Args:
        openai_key: OpenAI API key
        hf_token: Hugging Face token
        timeout: Seconds to wait for each validation request
    Returns:
        Dictionary with validation results, keyed by 'openai' and
        'huggingface'; a credential that was not supplied is reported False
    """
    results = {"openai": False, "huggingface": False}
    if openai_key:
        try:
            if hasattr(openai, "OpenAI"):
                # openai>=1.0 removed the module-level Model API; without
                # this branch every key would be reported invalid there
                openai.OpenAI(api_key=openai_key, timeout=timeout).models.list()
            else:
                # Legacy (<1.0) SDK
                openai.api_key = openai_key
                openai.Model.list()
            results["openai"] = True
        except Exception as e:
            # Exception (not bare except) lets Ctrl-C and SystemExit through
            logger.warning(f"OpenAI key validation failed: {str(e)}")
    if hf_token:
        try:
            response = requests.get(
                "https://huggingface.co/api/models",
                headers={"Authorization": f"Bearer {hf_token}"},
                timeout=timeout
            )
            results["huggingface"] = response.status_code == 200
        except Exception as e:
            logger.warning(f"Hugging Face token validation failed: {str(e)}")
    return results