cstr committed on
Commit
0ae08d5
1 Parent(s): d3c2351

Create functions.py

Files changed (1)
  1. functions.py +297 -0
functions.py ADDED
@@ -0,0 +1,297 @@
+ import logging
+ import re
+ from pathlib import Path
+ from typing import List, Dict, Optional
+
+ import openai
+ import requests
+ from PyPDF2 import PdfReader
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ def extract_text_from_pdf(file_path: str) -> str:
+     """
+     Extract text from a PDF file with robust error handling.
+
+     Args:
+         file_path: Path to the PDF file
+
+     Returns:
+         Extracted text as a string
+
+     Raises:
+         ValueError: If the file doesn't exist or isn't readable
+         RuntimeError: If text extraction fails
+     """
+     try:
+         if not Path(file_path).exists():
+             raise ValueError(f"PDF file not found: {file_path}")
+
+         reader = PdfReader(file_path)
+         text_content = []
+
+         for page_num, page in enumerate(reader.pages, 1):
+             try:
+                 text = page.extract_text()
+                 # extract_text() can return None or "" for image-only pages
+                 if text and text.strip():
+                     text_content.append(text)
+                 else:
+                     logger.warning(f"Page {page_num} appears to be empty or unreadable")
+             except Exception as e:
+                 logger.error(f"Error extracting text from page {page_num}: {e}")
+                 continue
+
+         if not text_content:
+             raise RuntimeError("No readable text found in PDF")
+
+         return "\n\n".join(text_content)
+
+     except Exception as e:
+         logger.error(f"PDF extraction failed: {e}")
+         raise RuntimeError(f"Failed to process PDF: {e}") from e
+
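+ # Illustrative usage (hypothetical file path, not part of the pipeline wiring):
+ #
+ #     text = extract_text_from_pdf("example.pdf")
+ #     print(f"Extracted {len(text)} characters")
+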
+ def format_content(text: str, format_type: str) -> str:
+     """
+     Format extracted text into the specified output format.
+
+     Args:
+         text: Raw text content
+         format_type: Output format ('txt', 'md', or 'html')
+
+     Returns:
+         Formatted text string
+
+     Raises:
+         ValueError: If the format type is invalid
+     """
+     if not isinstance(text, str):
+         raise ValueError("Input text must be a string")
+
+     # Clean up common PDF extraction artifacts
+     text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+     text = re.sub(r'(?<=[.!?])\s+', '\n\n', text)  # Split sentences into paragraphs
+     text = text.strip()
+
+     format_type = format_type.lower()
+
+     if format_type == 'txt':
+         return text
+
+     elif format_type == 'md':
+         paragraphs = text.split('\n\n')
+         md_text = []
+
+         for para in paragraphs:
+             # Heuristic: a capitalized paragraph with no sentence punctuation is a header
+             if re.match(r'^[A-Z][^.!?]*$', para.strip()):
+                 md_text.append(f"## {para.strip()}")
+             else:
+                 md_text.append(para.strip())
+
+         return '\n\n'.join(md_text)
+
+     elif format_type == 'html':
+         paragraphs = text.split('\n\n')
+         html_parts = ['<!DOCTYPE html>', '<html>', '<body>']
+
+         for para in paragraphs:
+             if re.match(r'^[A-Z][^.!?]*$', para.strip()):
+                 html_parts.append(f"<h2>{para.strip()}</h2>")
+             else:
+                 html_parts.append(f"<p>{para.strip()}</p>")
+
+         html_parts.extend(['</body>', '</html>'])
+         return '\n'.join(html_parts)
+
+     else:
+         raise ValueError(f"Unsupported format type: {format_type}")
+
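+ # Sketch of the heuristics above: after whitespace normalization, text is split
+ # into paragraphs at sentence boundaries, and a capitalized paragraph with no
+ # sentence punctuation becomes a heading. Hypothetical example:
+ #
+ #     format_content("It works. Overview", "md")
+ #     # -> "It works.\n\n## Overview"
+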
+ def split_into_snippets(text: str, chunk_size: int = 4000, overlap: int = 200) -> List[str]:
+     """
+     Split text into overlapping chunks that fit within model context windows.
+
+     Args:
+         text: Input text to split
+         chunk_size: Maximum size of each chunk in characters
+         overlap: Number of characters to overlap between chunks
+
+     Returns:
+         List of text snippets
+
+     Raises:
+         ValueError: If chunk_size is too small or text is empty
+     """
+     if not text:
+         raise ValueError("Input text is empty")
+
+     if chunk_size < 1000:
+         raise ValueError("Chunk size must be at least 1000 characters")
+
+     # Split into paragraphs first; paragraphs are never broken up, so a single
+     # paragraph longer than chunk_size yields an oversized chunk.
+     paragraphs = text.split('\n\n')
+     chunks = []
+     current_chunk = []
+     current_size = 0
+
+     for para in paragraphs:
+         para_size = len(para)
+
+         if current_size + para_size <= chunk_size:
+             current_chunk.append(para)
+             current_size += para_size + 2  # +2 for the joining newlines
+         else:
+             if current_chunk:
+                 chunks.append('\n\n'.join(current_chunk))
+
+             # Start the new chunk with the tail of the previous one as overlap
+             if chunks:
+                 overlap_text = chunks[-1][-overlap:] if overlap > 0 else ""
+                 current_chunk = [overlap_text, para]
+                 current_size = len(overlap_text) + para_size + 2
+             else:
+                 current_chunk = [para]
+                 current_size = para_size
+
+     # Add the last chunk if it exists
+     if current_chunk:
+         chunks.append('\n\n'.join(current_chunk))
+
+     return chunks
+
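+ # Worked example (hypothetical sizes): with chunk_size=4000 and overlap=200,
+ # five 1,000-character paragraphs produce two chunks. The first holds
+ # paragraphs 1-3 (~3,000 characters); the second starts with the last 200
+ # characters of the first chunk followed by paragraphs 4-5, so context is
+ # preserved across the cut.
+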
+ def build_prompts(chunks: List[str], custom_prompt: Optional[str] = None) -> List[str]:
+     """
+     Build formatted prompts for each text chunk.
+
+     Args:
+         chunks: List of text chunks
+         custom_prompt: Optional custom instruction that replaces the default one
+
+     Returns:
+         List of formatted prompt strings
+     """
+     default_prompt = """Please analyze and summarize the following text. Focus on:
+ 1. Key points and main ideas
+ 2. Important details and supporting evidence
+ 3. Any conclusions or recommendations
+
+ Please maintain the original meaning while being concise."""
+
+     instruction = custom_prompt if custom_prompt else default_prompt
+     prompts = []
+
+     for i, chunk in enumerate(chunks, 1):
+         prompt = f"""### Instruction
+ {instruction}
+
+ ### Input Text (Part {i} of {len(chunks)})
+ {chunk}
+
+ ### End of Input Text
+
+ Please provide your summary below:"""
+         prompts.append(prompt)
+
+     return prompts
+
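+ # Shape of each generated prompt (sketch):
+ #
+ #     ### Instruction
+ #     <default or custom instruction>
+ #
+ #     ### Input Text (Part 1 of N)
+ #     <chunk text>
+ #
+ #     ### End of Input Text
+ #
+ #     Please provide your summary below:
+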
+ def process_with_model(
+     prompt: str,
+     model_choice: str,
+     api_key: Optional[str] = None,
+     oauth_token: Optional[str] = None
+ ) -> str:
+     """
+     Process text with the selected model.
+
+     Args:
+         prompt: Input prompt
+         model_choice: Selected model name
+         api_key: OpenAI API key for GPT models
+         oauth_token: Hugging Face token for other models
+
+     Returns:
+         Generated summary
+
+     Raises:
+         ValueError: If required credentials are missing
+         RuntimeError: If model processing fails
+     """
+     try:
+         if 'gpt' in model_choice.lower():
+             if not api_key:
+                 raise ValueError("OpenAI API key required for GPT models")
+
+             # Uses the pre-1.0 openai SDK interface (openai.ChatCompletion)
+             openai.api_key = api_key
+             response = openai.ChatCompletion.create(
+                 model="gpt-3.5-turbo" if "3.5" in model_choice else "gpt-4",
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=0.7,
+                 max_tokens=1500
+             )
+             return response.choices[0].message.content
+
+         else:  # Hugging Face models
+             if not oauth_token:
+                 raise ValueError("Hugging Face token required")
+
+             headers = {"Authorization": f"Bearer {oauth_token}"}
+
+             # Map model choice to the actual model ID
+             model_map = {
+                 "Claude-3": "anthropic/claude-3-opus-20240229",
+                 "Mistral": "mistralai/Mixtral-8x7B-Instruct-v0.1"
+             }
+
+             model_id = model_map.get(model_choice)
+             if not model_id:
+                 raise ValueError(f"Unknown model: {model_choice}")
+
+             response = requests.post(
+                 f"https://api-inference.huggingface.co/models/{model_id}",
+                 headers=headers,
+                 json={"inputs": prompt}
+             )
+
+             if response.status_code != 200:
+                 raise RuntimeError(f"Model API error: {response.text}")
+
+             return response.json()[0]["generated_text"]
+
+     except Exception as e:
+         logger.error(f"Model processing failed: {e}")
+         raise RuntimeError(f"Failed to process with model: {e}") from e
+
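+ # Illustrative call (placeholder key, not a real credential; `prompts` as
+ # returned by build_prompts):
+ #
+ #     summary = process_with_model(prompts[0], "GPT-3.5", api_key="<OPENAI_API_KEY>")
+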
+ def validate_api_keys(openai_key: Optional[str] = None, hf_token: Optional[str] = None) -> Dict[str, bool]:
+     """
+     Validate API keys for the supported services.
+
+     Args:
+         openai_key: OpenAI API key
+         hf_token: Hugging Face token
+
+     Returns:
+         Dictionary with a validation result per service
+     """
+     results = {"openai": False, "huggingface": False}
+
+     if openai_key:
+         try:
+             openai.api_key = openai_key
+             openai.Model.list()  # Cheap authenticated call (pre-1.0 SDK)
+             results["openai"] = True
+         except Exception:
+             pass
+
+     if hf_token:
+         try:
+             response = requests.get(
+                 "https://huggingface.co/api/models",
+                 headers={"Authorization": f"Bearer {hf_token}"}
+             )
+             results["huggingface"] = response.status_code == 200
+         except Exception:
+             pass
+
+     return results
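+
+ # End-to-end sketch of how these helpers compose (illustrative only; the file
+ # name and key are placeholders):
+ #
+ #     text = extract_text_from_pdf("paper.pdf")
+ #     chunks = split_into_snippets(text, chunk_size=4000, overlap=200)
+ #     prompts = build_prompts(chunks)
+ #     summaries = [
+ #         process_with_model(p, "GPT-3.5", api_key="<OPENAI_API_KEY>")
+ #         for p in prompts
+ #     ]
+ #     print(format_content("\n\n".join(summaries), "md"))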