# ltu-chat / data_processor.py
import json
import logging
import os
from typing import Any, Dict, List

from bs4 import BeautifulSoup
from haystack import Document
from haystack.components.preprocessors.document_splitter import DocumentSplitter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_json_data(file_path: str) -> List[Dict[str, str]]:
    """
    Load data from a JSON file.

    Args:
        file_path: Path to the JSON file

    Returns:
        List of dictionaries containing the data
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Successfully loaded {len(data)} records from {file_path}")
        return data
    except Exception as e:
        logger.error(f"Error loading JSON data: {e}")
        return []
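
# Illustrative note (shape inferred from process_documents below, not guaranteed by
# this module): each record in the JSON file is expected to look roughly like
#   {"url": "https://example.org/programme", "content": "<html>...</html>"}
# where "content" holds the scraped page body.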


def extract_text_from_html(html_content: str) -> str:
    """
    Extract text content from HTML.

    Args:
        html_content: HTML content as string

    Returns:
        Extracted text content
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.extract()
        # Get the visible text
        text = soup.get_text(separator=' ', strip=True)
        # Collapse runs of whitespace into single spaces
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text
    except Exception as e:
        logger.error(f"Error extracting text from HTML: {e}")
        return ""


def process_documents(data: List[Dict[str, str]]) -> List[Dict[str, Any]]:
    """
    Process documents from the dataset.

    Args:
        data: List of dictionaries containing url and content fields

    Returns:
        List of processed documents with text content
    """
    processed_docs = []
    for i, item in enumerate(data):
        try:
            url = item.get('url', '')
            content = item.get('content', '')
            if not url or not content:
                continue
            # Create document with metadata
            doc = {
                'content': content,
                'meta': {
                    'url': url,
                    'doc_id': f"doc_{i}"
                }
            }
            processed_docs.append(doc)
        except Exception as e:
            logger.error(f"Error processing document {i}: {e}")
    logger.info(f"Successfully processed {len(processed_docs)} documents")
    return processed_docs
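
# Illustrative output shape for process_documents (derived from the code above):
#   {"content": "<raw page content>", "meta": {"url": "<source url>", "doc_id": "doc_0"}}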


def split_documents(docs: List[Dict[str, Any]], chunk_size: int = 500, overlap: int = 50) -> List[Dict[str, Any]]:
    """
    Split documents into smaller chunks for better retrieval using Haystack.

    Args:
        docs: List of processed documents
        chunk_size: Size of each chunk in words (DocumentSplitter splits by word)
        overlap: Overlap between chunks in words

    Returns:
        List of document chunks
    """
    # Initialize the Haystack document splitter (word-based splitting)
    document_splitter = DocumentSplitter(
        split_by="word",
        split_length=chunk_size,
        split_overlap=overlap
    )
    chunked_docs = []
    for doc in docs:
        # If the content is shorter than chunk_size words, keep it as is
        if len(doc['content'].split()) <= chunk_size:
            chunked_docs.append(doc)
            continue
        # Wrap the document for the Haystack splitter
        haystack_doc = Document(
            content=doc['content'],
            meta=doc['meta']
        )
        # Split the document
        result = document_splitter.run(documents=[haystack_doc])
        split_docs = result["documents"]
        # Convert chunks back to dicts (matching the declared return type)
        # and assign chunk-specific IDs
        for i, split_doc in enumerate(split_docs):
            chunk_meta = dict(split_doc.meta)
            chunk_meta["doc_id"] = f"{doc['meta']['doc_id']}_chunk_{i}"
            chunk_meta["chunk_id"] = i
            chunked_docs.append({'content': split_doc.content, 'meta': chunk_meta})
    logger.info(f"Split {len(docs)} documents into {len(chunked_docs)} chunks")
    return chunked_docs
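
# Illustrative chunk naming (derived from the loop above): a long document with
# doc_id "doc_3" yields chunks with doc_id "doc_3_chunk_0", "doc_3_chunk_1", ...
# plus a numeric "chunk_id" in their meta.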


if __name__ == "__main__":
    # Test the functions
    data_path = "ltu_programme_data.json"
    if os.path.exists(data_path):
        data = load_json_data(data_path)
        processed_docs = process_documents(data[:5])  # Process first 5 docs as a test
        chunked_docs = split_documents(processed_docs)
        print(f"Processed {len(processed_docs)} documents into {len(chunked_docs)} chunks")