|
from langchain_community.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from llama_cpp import Llama |
|
|
|
def extract_text_from_pdf(pdf_path, start_marker="ABSTRACT", end_marker="REFERENCES"):
    """Extract the body of a research paper from a PDF.

    Loads every page of the PDF, joins the page texts, and slices out the
    span between the first occurrence of *start_marker* and the first
    occurrence of *end_marker* (typically the abstract through the end of
    the paper, excluding the bibliography).

    Args:
        pdf_path: Path to the PDF file to read.
        start_marker: Heading that begins the relevant section
            (default "ABSTRACT").
        end_marker: Heading that ends the relevant section
            (default "REFERENCES").

    Returns:
        A tuple ``(text, length)`` where ``text`` is the extracted section
        (or a fallback message when the markers cannot be located in order)
        and ``length`` is ``len(text)``.

    Note:
        The original implementation split the text with
        RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
        and then rejoined the chunks with "".join(...). Because adjacent
        chunks overlap by ~50 characters, rejoining them duplicated the
        overlapping regions and inflated both the text and the reported
        length. The round-trip served no purpose (no chunk was used on its
        own), so it has been removed.
    """
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()

    # Join all page contents into one searchable string.
    all_text = " ".join(page.page_content for page in pages)

    # find() returns -1 when a marker is absent; require both markers
    # present and in the expected order before slicing.
    start_index = all_text.find(start_marker)
    end_index = all_text.find(end_marker)
    if start_index != -1 and end_index != -1 and start_index < end_index:
        relevant_text = all_text[start_index:end_index]
    else:
        relevant_text = "Unable to locate the specified sections in the document."

    return relevant_text, len(relevant_text)
|
|
|
def load_llm_model(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
    *,
    n_ctx=50000,
    n_batch=16384,
    verbose=False,
):
    """Download (if needed) and load a GGUF model via llama-cpp-python.

    Calling with no arguments behaves exactly as before; the model
    coordinates and runtime sizes are now parameters so other models or
    context sizes can be used without editing this function.

    Args:
        repo_id: Hugging Face repository id of the GGUF model.
        filename: GGUF file within the repository to load.
        n_ctx: Context window size in tokens.
        n_batch: Prompt-processing batch size.
        verbose: Whether llama.cpp should print its own loading logs.

    Returns:
        The initialized ``Llama`` instance.

    Raises:
        Exception: Re-raises whatever ``Llama.from_pretrained`` raises
            (download failure, missing file, out-of-memory, ...) after
            printing a diagnostic message.
    """
    try:
        llm = Llama.from_pretrained(
            repo_id=repo_id,
            filename=filename,
            n_ctx=n_ctx,
            n_batch=n_batch,
            verbose=verbose,
        )
        print("LLM model loaded successfully")
        return llm
    except Exception as e:
        # Surface the failure for visibility, then let the caller decide
        # how to handle it — do not swallow the error.
        print(f"Error loading LLM model: {e}")
        raise