RJ1200 committed on
Commit
eb985bd
·
verified ·
1 Parent(s): da391fd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.embeddings import HuggingFaceEmbeddings
2
+ from langchain.vectorstores import FAISS
3
+ import faiss
4
+ import pickle
5
+ import torch
6
+
7
+ from tqdm import tqdm
8
+ from langchain.docstore import InMemoryDocstore
9
+ from sentence_transformers import SentenceTransformer
10
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
11
+ from ragatouille import RAGPretrainedModel
12
+ from typing import List, Tuple, Optional
13
+ from langchain.docstore.document import Document as LangchainDocument
14
+
15
+
16
# Model initialization
READER_MODEL_NAME = "RJ1200/llama-3_3b-fine_tuned"

# Quantization settings handed to `from_pretrained`: 4-bit NF4 weights with
# double quantization, using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The model is loaded first, then its tokenizer, both from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Sampling / decoding options applied to every call of the reader pipeline.
_GENERATION_KWARGS = {
    "do_sample": True,
    "temperature": 0.2,
    "repetition_penalty": 1.1,
    "return_full_text": False,
    "max_new_tokens": 1000,
}

READER_LLM = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **_GENERATION_KWARGS,
)

# Initialize reranker
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
41
+
42
# System message: sets the assistant's role and the rules for the question paper.
_SYSTEM_MESSAGE = """You are an AI assistant specializing in analyzing PDF documents. Your task is to generate a comprehensive question paper based on the provided PDF context.
For each section mentioned, generate the exact number of questions as specified.
Ensure that the questions are relevant, clear, and cover the key topics within the section.
Reference specific page numbers or sections from the PDF whenever applicable.
If the information needed to create questions is not available in the PDF context, clearly state that.
"""

# User message: `{context}` and `{question}` are the only `str.format` fields.
# NOTE(review): `section_requirements` is plain literal text here, not a
# placeholder -- the per-part question counts below are hard-coded.
_USER_MESSAGE = """PDF Context:
{context}
---
For the following sections, generate the required number of questions:
section_requirements
part A-10,
part B- 5,
part C- 4

---
Question: {question}"""

prompt_in_chat_format = [
    {"role": "system", "content": _SYSTEM_MESSAGE},
    {"role": "user", "content": _USER_MESSAGE},
]

# Render the chat messages to a single string (not token ids) that still
# contains the `{context}` / `{question}` fields, ending with the generation prompt.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    tokenize=False,
    add_generation_prompt=True,
)
71
+
72
def answer_with_rag(
    question: str,
    llm: pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
) -> Tuple[str, List[str]]:
    """Answer ``question`` using passages retrieved from ``knowledge_index``.

    Args:
        question: Query used for retrieval, reranking and the final prompt.
        llm: Callable text-generation pipeline; called with the prompt string
            and expected to return a list whose first item has a
            ``"generated_text"`` key.
        knowledge_index: Vector store queried via ``similarity_search``.
        reranker: Optional reranker; when given, its ``rerank`` results are
            read through their ``"content"`` key.
        num_retrieved_docs: Number of passages requested from the index.
        num_docs_final: Maximum number of passages placed in the prompt.

    Returns:
        A ``(answer, passages)`` tuple: the generated text and the passage
        texts that were inserted into the prompt.
    """
    # Gather documents with the retriever. Use the index that was passed in:
    # the previous code ignored `knowledge_index` and read a module-level
    # KNOWLEDGE_VECTOR_DATABASE that is not defined anywhere in this file.
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]  # Keep only the text

    # Optionally rerank results (skipped when nothing was retrieved, since
    # there is nothing to rank).
    if reranker and relevant_docs:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]
    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt: each passage is introduced by a numbered header.
    context = "\nExtracted PDF content:\n" + "".join(
        f"Section {i}:::\n{doc}" for i, doc in enumerate(relevant_docs, start=1)
    )
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate an answer
    answer = llm(final_prompt)[0]["generated_text"]
    return answer, relevant_docs
98
+
99
def generate_questions(context: str):
    """Generate a question paper for the prompt typed into the UI.

    Args:
        context: Text from the "Enter Prompt" textbox. It is used as the
            question for retrieval and generation; when it is empty or only
            whitespace, the fixed default question is used instead.

    Returns:
        The generated text returned by ``answer_with_rag``.
    """
    default_question = "generate end-sem question paper?"
    # Previously the textbox value was discarded and the default was always used.
    question = (context or "").strip() or default_question

    # NOTE(review): no assignment to KNOWLEDGE_VECTOR_DATABASE is visible in
    # this file; it must be created/loaded before this handler can run.
    answer, _ = answer_with_rag(
        question,
        READER_LLM,
        KNOWLEDGE_VECTOR_DATABASE,
        reranker=RERANKER,
    )
    return answer
103
+
104
# Gradio interface
# `gr` is used throughout this section but was never imported in this file,
# so the script failed with a NameError before the UI could be built.
import gradio as gr

# NOTE(review): the scraped source lost its indentation; the button and the
# output box are assumed to sit directly under the Blocks container, below the
# row holding the prompt textbox -- confirm against the intended layout.
with gr.Blocks() as interface:
    gr.Markdown("""
# C Question Paper Generator

""")

    with gr.Row():
        context_input = gr.Textbox(label="Enter Prompt", placeholder="prompt", lines=1)

    generate_button = gr.Button("Generate Questions")
    output_text = gr.Textbox(label="Generated Questions", lines=20)

    # Clicking the button sends the prompt text to `generate_questions` and
    # shows its return value in the output box.
    generate_button.click(generate_questions, inputs=[context_input], outputs=[output_text])

interface.launch()