from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_cpp import Llama


def extract_text_from_pdf(pdf_path):
    """Extract the ABSTRACT-to-REFERENCES span of a research-paper PDF.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file on disk.

    Returns
    -------
    tuple[str, int]
        The extracted body text and its character length. If either the
        "ABSTRACT" or "REFERENCES" marker cannot be located (or they appear
        out of order), a fixed fallback message is returned instead of the
        paper text.
    """
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()
    all_text = " ".join(page.page_content for page in pages)

    # Keep only the body of the paper: from the "ABSTRACT" heading up to
    # (but excluding) the "REFERENCES" heading. Markers are matched
    # case-sensitively, exactly as the original did.
    start_index = all_text.find("ABSTRACT")
    end_index = all_text.find("REFERENCES")
    if start_index != -1 and end_index != -1 and start_index < end_index:
        relevant_text = all_text[start_index:end_index]
    else:
        relevant_text = "Unable to locate the specified sections in the document."

    # BUG FIX: the original split `relevant_text` into 250-char chunks with a
    # 50-char overlap (RecursiveCharacterTextSplitter) and then re-joined the
    # chunks with "".join(...). Because consecutive chunks share up to 50
    # characters, the re-join duplicated every overlapped region, corrupting
    # the returned text and inflating the reported length. The chunks were
    # never used individually, so the split/rejoin round trip is dropped and
    # the extracted text is returned as-is.
    return relevant_text, len(relevant_text)


def load_llm_model():
    """Load the Llama-3.2-1B-Instruct GGUF model via llama-cpp-python.

    Downloads (or reuses a cached copy of) the quantized model from the
    Hugging Face Hub and constructs a ``Llama`` handle.

    Returns
    -------
    Llama
        The ready-to-use model handle.

    Raises
    ------
    Exception
        Whatever ``Llama.from_pretrained`` raised; the error is printed
        before being re-raised so the caller still sees the failure.
    """
    try:
        llm = Llama.from_pretrained(
            repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
            filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
            n_ctx=50000,    # large context window; NOTE(review): Llama 3.2 supports up to 128k — confirm this fits available RAM
            n_batch=16384,  # prompt-processing batch size; presumably tuned for throughput — verify on target hardware
            verbose=False,
        )
        print("LLM model loaded successfully")
        return llm
    except Exception as e:
        print(f"Error loading LLM model: {e}")
        raise