Yoxas committed on
Commit
2ea93fe
·
verified ·
1 Parent(s): 1f6f397

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -131
app.py CHANGED
@@ -1,132 +1,40 @@
1
- import os
2
- import re
3
- import torch
4
  import pandas as pd
5
- from PyPDF2 import PdfReader
6
- from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
7
- from gradio import Interface, File
8
- import gradio as gr
9
- import spaces
10
-
11
- # Load the tokenizer and model
12
- led_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-m3")
13
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
14
-
15
- # Load the model separately
16
- model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-v2-m3")
17
-
18
- # Move the model to CUDA if available
19
- if torch.cuda.is_available():
20
- model = model.to("cuda")
21
-
22
- # Function to clean text by keeping only alphanumeric characters and spaces
23
- def clean_text(text):
24
- return re.sub(r'[^a-zA-Z0-9\s]', '', text)
25
-
26
- # Function to extract text from PDF files
27
- def extract_text(pdf_file):
28
- try:
29
- pdf_reader = PdfReader(pdf_file)
30
- if pdf_reader.is_encrypted:
31
- print(f"Skipping encrypted file: {pdf_file}")
32
- return None
33
- text = ''
34
- for page in pdf_reader.pages:
35
- text += page.extract_text() or ''
36
- return text
37
- except Exception as e:
38
- print(f"Error extracting text from {pdf_file}: {e}")
39
- return None
40
-
41
- # Function to split text into chunks of a specified size
42
- def split_text(text, chunk_size=1024):
43
- words = text.split()
44
- for i in range(0, len(words), chunk_size):
45
- yield ' '.join(words[i:i + chunk_size])
46
-
47
- # Function to classify text using LED model
48
- @spaces.GPU(duration=120)
49
- def classify_text(text):
50
- try:
51
- return classifier(text)[0]['label']
52
- except IndexError:
53
- return "Unable to classify"
54
-
55
- # Function to summarize text using the summarizer model
56
- @spaces.GPU(duration=120)
57
- def summarize_text(text, max_length=100, min_length=30):
58
- try:
59
- return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
60
- except IndexError:
61
- return "Unable to summarize"
62
-
63
- # Function to extract a title-like summary from the beginning of the text
64
- @spaces.GPU(duration=120)
65
- def extract_title(text, max_length=20):
66
- try:
67
- return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
68
- except IndexError:
69
- return "Unable to extract title"
70
-
71
- # Define the folder path and CSV file path
72
- # output_folder_path = '/content/drive/My Drive/path_to_output' # Adjust this to your actual path
73
-
74
- # Define the Gradio interface for file upload and download
75
- @spaces.GPU(duration=120)
76
- def process_files(pdf_files):
77
- data = []
78
- for pdf_file in pdf_files:
79
- text = extract_text(pdf_file)
80
-
81
- # Skip encrypted files
82
- if text is None:
83
- continue
84
-
85
- # Extract a title from the beginning of the text
86
- title_text = ' '.join(text.split()[:512]) # Take the first 512 tokens for title extraction
87
- title = extract_title(title_text)
88
-
89
- # Initialize placeholders for combined results
90
- combined_abstract = []
91
- combined_cleaned_text = []
92
-
93
- # Split text into chunks and process each chunk
94
- for chunk in split_text(text, chunk_size=512):
95
- # Summarize the text chunk
96
- abstract = summarize_text(chunk)
97
- combined_abstract.append(abstract)
98
-
99
- # Clean the text chunk
100
- cleaned_text = clean_text(chunk)
101
- combined_cleaned_text.append(cleaned_text)
102
-
103
- # Combine results from all chunks
104
- final_abstract = ' '.join(combined_abstract)
105
- final_cleaned_text = ' '.join(combined_cleaned_text)
106
-
107
- # Append the data to the list
108
- data.append([title, final_abstract, final_cleaned_text])
109
-
110
- # Create a DataFrame from the data list
111
- df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
112
-
113
- # Save the DataFrame to a CSV file
114
- output_file_path = 'processed_pdfs.csv'
115
- df.to_csv(output_file_path, index=False)
116
-
117
- return output_file_path
118
-
119
- # Gradio interface
120
- pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
121
- csv_output = gr.File(label="Download CSV")
122
-
123
- gr.Interface(
124
- fn=process_files,
125
- inputs=pdf_input,
126
- outputs=csv_output,
127
- title="Dataset creation",
128
- description="Upload PDF files and get a summarized CSV file.",
129
- article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
130
- <p>This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.</p>
131
- <p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
132
- ).launch(share=True)
 
1
+ import gradio as gr
 
 
2
  import pandas as pd
3
+ from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
4
+
5
+ # Load the tokenizer and retriever
6
+ tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
7
+ retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
8
+
9
+ # Load the model
10
+ model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
11
+
12
+ # Tokenize the contexts and responses
13
+ inputs = tokenizer(contexts, return_tensors='pt', padding=True, truncation=True)
14
+ labels = tokenizer(responses, return_tensors='pt', padding=True, truncation=True)
15
+
16
+ # Load your dataset
17
+ df = pd.read_csv('your_dataset.csv')
18
+
19
+ # Ensure the dataset has the required columns for RAG
20
+ # For example, it should have 'context' and 'response' columns
21
+ contexts = df['Abstract'].tolist()
22
+ #responses = df['response'].tolist()
23
+
24
+ def generate_response(input_text):
25
+ input_ids = tokenizer([input_text], return_tensors='pt')['input_ids']
26
+ outputs = model.generate(input_ids)
27
+ response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
28
+ return response
29
+
30
+ # Create the Gradio interface
31
+ iface = gr.Interface(
32
+ fn=generate_response,
33
+ inputs="text",
34
+ outputs="text",
35
+ title="RAG Chatbot",
36
+ description="A chatbot powered by Retrieval-Augmented Generation (RAG) model."
37
+ )
38
+
39
+ # Launch the interface
40
+ iface.launch()